## -----------------------------------------------------------------------------------------------
library(tidyverse)
library(foreign)



## -----------------------------------------------------------------------------------------------
D <- readxl::read_excel("../../../data/data_PSID/main/identifiers/CrossYearIndex.xlsx")
D <- D[-1]

index <- read.csv("../../../data/data_PSID/main/identifiers/CrossYearIndex_labels.csv")
names(index) <- c("variable","label","year")
index$label <- str_c(index$label,'_',index$year) 
index <- index[-3]

#everything should be renamed #<- names
for (i in 1:length(D)){
  names(D)[i] = as.character(index[["label"]][i])
}

D <- D %>% pivot_longer(
  cols=!c(in_1968,pn_1968),
  names_to = c("variable","year"),
  names_sep = "_",
  values_to="value")

D <- D %>%
  pivot_wider(names_from=variable,values_from=value) %>%
  mutate(year = as.integer(year))

names(D) <- c("intnum68","pernum","year","relhead",
               "mpair","intnum","sn")

D <- D[,c(1,2,3,6,7,4,5)] %>%
  filter(!is.na(intnum),intnum>0) #<- drop observations where an individual cannot be merged to an interview. We don't need those.

identifiers_panel <- D



## -----------------------------------------------------------------------------------------------
D <- readxl::read_excel("../../../data/data_PSID/main/expenditures/childcare/Main_childcare.xlsx")

# these are all the years that the question is available
years <- c(seq(1988,1997),seq(1999,2017,2))

d <- data.frame(year = integer(), intnum = integer(), childcare_exp = double())

for (i in 1:length(years)) {
  d0 <- data.frame(year = years[i], intnum = D[[(i-1)*3+2]], childcare_exp = D[[(i-1)*3+3]])
  d <- rbind(d,na.omit(d0))
}

# code up missing values
d$childcare_exp[(d$childcare_exp>=99998) & d$year<1999] <- NA
d$childcare_exp[(d$childcare_exp>=999998)] <- NA

main_childcare <- d



## -----------------------------------------------------------------------------------------------
H <- read.csv("../../../data/data_PSID/main/state/State.csv")

index2 <- read.csv("../../../data/data_PSID/main/state/state-labels.csv",header=FALSE)
names(index2) <- c("variable","label","year")
index2$label <- str_c(index2$label,'_',index2$year) 
index2 <- index2[-3]

for (i in 1:length(H)){
  names(H)[i] = as.character(index2[["label"]][i])
}

H <- H %>% select(-contains("relnum")) #dropping release num

H <- H %>%
  pivot_longer(cols=everything(),names_to = c(".value","year"),names_sep = "_") %>%
  mutate(year = as.integer(year)) %>%
  filter(!is.na(intnum)) %>%
  filter(state!=0,state!=99) #<- drop missing observations

H <- H[,c(2,3,1)]

main_state <- H

state_codes <- read.csv("../../../data/data_PSID/StateCodes.csv") %>%
  mutate(X = NULL, X.1 = NULL) %>% #<- this is a simple crosswalk of state codes
  select(-state) %>%
  rename(state = PSID, StFIPS = fips) %>% #<- we rename some things in order to do the merge properly
  select(state, StFIPS, SOI, state_str)


## -----------------------------------------------------------------------------------------------
# ---- Measure 1: using an indicator for whether the individual is living in the state they grew up in
years <- c(1968,seq(1969,1996),seq(1997,2019,2))

d1 <- readxl::read_excel("../../../data/data_PSID/main/parent-location/head_mobility.xlsx")

# create a horizontal file:
head_grow_up = data.frame() 
for (i in seq(1:length(years))) {
  d = data.frame()
  #d$year <- years[i]
  v_intnum <- names(d1)[(i-1)*3+2]
  d <- d1[,v_intnum] %>%
    rename(intnum = v_intnum)
  d$year <- years[i]
  #d$intnum <- d1[,v_intnum]
  v_loc <- names(d1)[i*3]
  d[,"state_grow_up"]<- d1[,v_loc]
  d <- d %>%
    drop_na()
  head_grow_up <- rbind(head_grow_up,d)
}

head_grow_up <-  head_grow_up %>%
  mutate(state_grow_up=na_if(state_grow_up,9))



## -----------------------------------------------------------------------------------------------

# ---- Measure 2: distance to parents in the 1988 main interview using a measure of distance from 1988:

# load and clean the data
# here we create a measure of whether either parent is within 10 miles or within 100 miles
parent_distance <- readxl::read_excel("../../../data/data_PSID/main/parent-location/parent_distance.xlsx") %>%
  rename(                      
         intnum = V14802,#     1988 INTERVIEW NUMBER                   
         head_miles_father = V15816,     # OF MILES TO FATHER                    
         head_miles_mother = V15830,     # OF MILES TO MOTHER                    
         sp_miles_father = V15873,     # OF MILES TO FATHER-W                  
         sp_miles_mother = V15887,     # OF MILES TO MOTHER-W                  
  ) %>%
  mutate(head_ten_miles = head_miles_father<=2 | head_miles_mother<=2,
         head_hundred_miles = head_miles_father<=3 | head_miles_mother<=3,
         sp_ten_miles = sp_miles_father<=2 | sp_miles_mother<=2,
         sp_hundred_miles = sp_miles_father<=3 | sp_miles_mother<=3) %>%
  drop_na()




## -----------------------------------------------------------------------------------------------
relative_present <- read.csv("../../../data/data_PSID/main/family_matrix/family_matrix.csv") %>%
  mutate(KID=MX5*1000 +   MX6)%>%
  #indicator: there is an older relative (grandparent or uncle/aunt) in the HH
  mutate(biol_grand=case_when(MX8==60 ~ 1,
                              #  MX8==94 ~ NA_real_,
                              TRUE ~ 0))%>%
  mutate(older_relative=case_when(MX8>=60 & MX8<=65 ~ 1,
                                  MX8>=71 & MX8<=71 ~ 1,
                                  MX8>=80 & MX8<=81 ~ 1,
                                  MX8==84 ~ 1,
                                  # MX8==94 ~ NA_real_,
                                  TRUE ~ 0))%>%
  group_by(KID, year) %>%
  mutate(older_relative_hh = max(older_relative, na.rm = TRUE), 
         biol_grand_hh = max(biol_grand, na.rm = TRUE) )%>%
  select(KID,year,older_relative_hh,  biol_grand_hh)%>%
  distinct(KID, year, .keep_all = TRUE)



## -----------------------------------------------------------------------------------------------

D <- readxl::read_excel("../../../data/data_PSID/main/race/Race.xlsx") %>% 
  rename(intnum = V3)


# let's create the data just by stacking
# two blocks: years with just head, and years with
years1 <- seq(1969,1984)
years2 <- c(seq(1985,1997),seq(1999,2017,2))

# here are the variable names we want to use, copied and pasted
# NOTE: we switched the order of head and spouse in 1985 (first appearance) in order to do things properly
vnames <- c("V441"   ,"V442"   ,"V801"   ,"V1101"  ,"V1102"  ,"V1490"  ,"V1801"  ,"V1802"  ,"V2202"  ,"V2401"  ,"V2402"  ,"V2828"  ,"V3001"  ,"V3002"  ,"V3300"  ,"V3401"  ,"V3402"  ,"V3720"  ,"V3801"  ,"V3802"  ,"V4204"  ,"V4301"  ,"V4302"  ,"V5096"  ,"V5201"  ,"V5202"  ,"V5662"  ,"V5701"  ,"V5702"  ,"V6209"  ,"V6301"  ,"V6302"  ,"V6802"  ,"V6901"  ,"V6902"  ,"V7447"  ,"V7501"  ,"V7502"  ,"V8099"  ,"V8201"  ,"V8202"  ,"V8723"  ,"V8801"  ,"V8802"  ,"V9408"  ,"V10001" ,"V10002" ,"V11055" ,"V11101" ,"V11102" ,"V12293" ,"V11938" ,"V12501" ,"V12502" ,"V13500" ,"V13565" ,"V13701" ,"V13702" ,"V14547" ,"V14612" ,"V14801" ,"V14802" ,"V16021" ,"V16086" ,"V16301" ,"V16302" ,"V17418" ,"V17483" ,"V17701" ,"V17702" ,"V18749" ,"V18814" ,"V19001" ,"V19002" ,"V20049" ,"V20114" ,"V20301" ,"V20302" ,"V21355" ,"V21420" ,"V21601" ,"V21602" ,"V23212" ,"V23276" ,"ER2001" ,"ER2002" ,"ER3883" ,"ER3944" ,"ER5001" ,"ER5002" ,"ER6753" ,"ER6814" ,"ER7001" ,"ER7002" ,"ER8999" ,"ER9060" ,"ER10001","ER10002","ER11760","ER11848","ER13001","ER13002","ER15836","ER15928","ER17001","ER17002","ER19897","ER19989","ER21001","ER21002","ER23334","ER23426","ER25001","ER25002","ER27297","ER27393","ER36001","ER36002","ER40472","ER40565","ER42001","ER42002","ER46449","ER46543","ER47301","ER47302","ER51810","ER51904","ER53001","ER53002","ER57549","ER57659","ER60001","ER60002","ER64671","ER64810","ER66001","ER66002","ER70744","ER70882")

V <- data.frame()

for (i in seq(1,length(years1))) {
  v1 = vnames[(i-1)*3+2]
  v2 = vnames[i*3]
  d <- select(D,c(v1,v2))
  d <- filter(d,!is.na(d[,v1]))
  names(d) <- c("intnum","race_head")
  d$year <- years1[i]
  d$race_wife <- NA
  V <- rbind(V,d)
  
  
}

for (i in seq(1,length(years2))) {
  j = i + length(years1)
  v1 = vnames[48+(i-1)*4+2]
  v2 = vnames[48+(i-1)*4+3]
  v3 = vnames[48 + i*4]
  d <- select(D,c(v1,v3,v2))
  d <- filter(d,!is.na(d[,v1]))
  names(d) <- c("intnum","race_head","race_wife")
  d$year <- years2[i]
  V <- rbind(V,d)
  
  
}

Ind = identifiers_panel %>% 
  mutate(ID = intnum68*1000 + pernum) %>%
  filter(year<=2011) #<- use only data from 2011 and earlier

V <- merge(V,Ind)
V$Race <- NA
V[V$sn==1,"Race"] <- V[V$sn==1,"race_head"]
V[V$sn==2,"Race"] <- V[V$sn==2,"race_wife"]

D <- V %>%
  filter(!is.na(Race) & Race!=0 & Race!=9) %>%
  group_by(ID) %>%
  arrange(year) %>%
  filter(row_number()==n()) %>% #<- keep the latest record
  select(ID,Race)

race <- D
  


## -----------------------------------------------------------------------------------------------
D <- readxl::read_excel("../../../data/data_PSID/main/education/grades_completed.xlsx") %>%
  mutate(ID = ER30001*1000 + ER30002)
D1 <- D[,seq(5,158,4)]

# this routine pulls out the most recent education variable, up to the year 2011.
# the data itself go until 2017, but we only go up to 2011 to replicate the data used in first draft of CLMP
D$educ <- -1
D$year_meas <- -1
years <- c(1968,seq(1970,1996),seq(1997,2017,2))

# replace with (i in 1:length(years)) if we want to update using measures from 2013-2017
for (i in 1:36) {
  print(i)
  I_use <- (D1[,i]>0) & (D1[,i]<98)
  D[I_use,"educ"] <- D1[I_use,i]
  D[I_use,"year_meas"] <- years[i]
}

# do some final data cleaning and save the file
education <- D %>%
  mutate(educ = na_if(educ,-1), 
         #ed=case_when(educ>16 ~ 5,educ == 16 ~ 4, educ>=13 ~ 3, educ>=12 ~ 2, educ<12 ~ 1)
         ed=case_when(educ>16 ~ ">16",educ == 16 ~ "16", educ>=13 ~ "13-15", educ>=12 ~ "12", educ<12 ~ "<12")
         ) %>%
  select(ID,educ,ed,year_meas) %>%
  #mutate(ed = factor(ed,levels = c(1,2,3,4,5),labels=c("<12","12","13-15","16",">16"))) %>%
  select(ID,ed) #<- just use this measure for now


## -----------------------------------------------------------------------------------------------
G <- readxl::read_excel("../../../data/data_PSID/main/labor-market/earnings-hours.xlsx")

index1 <- read.csv("../../../data/data_PSID/main/labor-market/earnings_labels_clean.csv",header=FALSE)
names(index1) <- c("variable","label","year")
index1$label <- str_c(index1$label,'_',index1$year) 
index1 <- index1[-3]

#everything should be renamed
for (i in 1:length(G)){
  names(G)[i] = as.character(index1[["label"]][i])
}

G <- G %>% select(-contains("relnum")) %>% #dropping the relnum columns
  pivot_longer(cols=everything(),names_to = c(".value","year"),names_sep = "_") %>%
  filter(!is.na(intnum)) %>%
  mutate(year = as.integer(year))

G[c("earnspousebusiness")][is.na(G[c("earnspousebusiness")])] <- 0
#replaces NA business earnings with 0

names(G) <- c("year","intnum","hours_head","hours_spouse",
              "earn_head","earn_spouse","earn_spouseBusiness")

G <- G[,c(2,1,5,6,3,4,7)]

##crude way to remove the variables we don't want, I was checking sequentially
G <- G[!(G$year==1993 & G$hours_head==9999),] 
G <- G[!(G$year==1993 & G$hours_spouse==6730),]
G <- G[!(G$year==1993 & G$hours_spouse==9999),] 
G <- G[!(G$year==1993 & G$earn_head==99999999),] 
G <- G[!(G$year==1994 & G$hours_head==9999),]
G <- G[!(G$year==1994 & G$hours_spouse==9999),]
G <- G[!(G$year==1994 & G$earn_head==9999999),]
G <- G[!(G$year==1994 & G$earn_spouseBusiness==999999),]
G <- G[!(G$year==1994 & G$earn_spouse==9999999),]
G <- G[!(G$year==1995 & G$hours_head==7800),]
G <- G[!(G$year==1995 & G$earn_head==9999999),]
G <- G[!(G$year==1995 & G$earn_spouse==9999999),]
G <- G[!(G$year==1996 & G$hours_spouse==999999),]

earnings_panel <- G


## -----------------------------------------------------------------------------------------------

#changes: add T2 income variables from the individual fils : available up to 2007, no calculated hourly wage after 2001
#changes: add T2 income variables from the family: merge using ID in 2009
#ifif hrs and earnings positive - use earnings : why so many zero wages?


Dt2 <- readxl::read_excel("../../../data/data_PSID/main/labor-market/T2_97_07.xlsx") %>% 
  mutate(MID = ER30001*1000 + ER30002,FID = MID) %>%
  rename(earn97 = ER33536A,earn99 = ER33627A,earn01 = ER33727A) %>%
  rename(earn97u = ER33536B,earn99u = ER33627B,earn01u = ER33727B) %>%
  rename(wage97 = ER33537O,wage99 = ER33628O,wage01 = ER33728O) %>%
  rename(ww97 = ER33536C,ww99 = ER33627C,ww01 = ER33727C) %>%
  rename(hw97 = ER33536Q,hw99 = ER33627Q,hw01 = ER33727Q)%>%
  rename(earn03 = ER33826A,earn05 = ER33926A) %>%
  rename(earn03u = ER33826B ,earn05u = ER33926B) %>%
  rename(hw03 = ER33827U, hw05 = ER33927C ) %>%
  rename(ww03 = ER33827S , ww05 = ER33927A  )



  
# let's just do these for now
D97 <- Dt2 %>% select(MID,FID,earn97,earn97u,wage97,ww97,hw97) %>%
  rename(earn=earn97,earnu=earn97u,wage=wage97,ww=ww97,hw=hw97) %>%
  mutate(year=1997)

D99 <- Dt2 %>% select(MID,FID,earn99,earn99u,wage99,ww99,hw99) %>%
  rename(earn=earn99,earnu=earn99u,wage=wage99,ww=ww99,hw=hw99) %>%
  mutate(year=1999)

D01 <- Dt2 %>% select(MID,FID,earn01,earn01u,wage01,ww01,hw01) %>%
  rename(earn=earn01,earnu=earn01u,wage=wage01,ww=ww01,hw=hw01) %>%
  mutate(year=2001)

D03 <- Dt2 %>% select(MID,FID,earn03,earn03u,ww03,hw03) %>%
  rename(earn=earn03,earnu=earn03u,ww=ww03,hw=hw03) %>%
  mutate(year=2003)

D05 <- Dt2 %>% select(MID,FID,earn05,earn05u,ww05,hw05) %>%
  rename(earn=earn05,earnu=earn05u,ww=ww05,hw=hw05) %>%
  mutate(year=2005)

# --- Now, read in the T-2 in 2001 variables from the family file
Dt01 <- read.csv("../../../data/data_PSID/main/labor-market/T2_supp.csv") %>% 
  rename(intnum = ER21002,h_earn = ER23702F1,h_ww = ER23702D3,h_hw = ER23702E8) %>%
  rename(w_earn = ER23702L4, w_ww = ER23702J6, w_hw = ER23702L2) %>%
  mutate(year = 2003)


#Doownload variables from family files separately from individual files - smaller datasets
# --- Read in the T-2 from family file in 2003
Dt03 =  readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>% #2005 family interview number , 2003 earnings
  rename(intnum = ER25002,h_earn = ER27711F1,h_ww = ER27711D3,h_hw = ER27711E8, h_earnu=ER27711F2) %>%
  rename(w_earn = ER27711L4, w_ww = ER27711J6  , w_hw = ER27711L2, w_earnu= ER27711L5) %>%
  mutate(year = 2005) %>%
  select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>%
  filter(is.na(intnum)==0)


Dt05 =  readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>% #2007 family interview number, 2005 earnings
  rename(intnum = ER36002,h_earn = ER40686F1,h_ww = ER40686D3,h_hw = ER40686E8 , h_earnu=ER40686F2) %>%
  rename(w_earn = ER40686L4, w_ww =  ER40686J6 , w_hw = ER40686L2, w_earnu= ER40686L5 ) %>%
  mutate(year = 2007) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)


# --- Read in the T-2 in 2007 for household heads and wives #2009 family interview number, 2007 earnings
Dt07 =  readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER42002 ,h_earn = ER46673,h_ww = ER46670 ,h_hw = ER46671 , h_earnu=ER46674 ) %>%
  rename(w_earn = ER46684, w_ww = ER46681   , w_hw = ER46682 , w_earnu= ER46685 ) %>%
  mutate(year = 2009) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu) %>% 
  filter(is.na(intnum)==0)


# --- Read in the T-2 in 2009 for household heads and wives #2011 family interview number, 2009 earnings
Dt09 =  readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER47302 ,h_earn = ER52074 ,h_ww = ER52071 ,h_hw = ER52072 , h_earnu= ER52075 ) %>%
  rename(w_earn = ER52085, w_ww = ER52082   , w_hw = ER52083 , w_earnu= ER52086 ) %>%
  mutate(year = 2011) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)


# --- Read in the T-2 in 2011 for household heads and wives #2013 family interview number, 2011 earnings
Dt11 =   readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER53002 ,h_earn = ER57841 ,h_ww = ER57825 ,h_hw = ER57839 , h_earnu= ER57842 ) %>%
  rename(w_earn = ER57889, w_ww = ER57873  , w_hw = ER57887  , w_earnu= ER57890 ) %>%
  mutate(year = 2013) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)




# --- Read in the T-2 in 2013 for household heads and wives #2015 family interview number, 2013 earnings
Dt13 =   readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER60002 ,h_earn = ER65021 ,h_ww = ER65005,h_hw = ER65019 , h_earnu= ER65022) %>%
  rename(w_earn = ER65069 , w_ww = ER65053  , w_hw = ER65067 , w_earnu= ER65070 ) %>%
  mutate(year = 2015) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)

# --- Read in the T-2 in 2015 for household heads and wives #2017 family interview number, 2015 earnings
Dt15 =   readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER66002,h_earn = ER71113 ,h_ww = ER71097,h_hw = ER71111, h_earnu= ER71114) %>%
  rename(w_earn = ER71161, w_ww = ER71145  , w_hw = ER71159 , w_earnu= ER71162) %>%
  mutate(year = 2017) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)

# --- Read in the T-2 in 2017 for household heads and wives #2019 family interview number, 2017 earnings
Dt17 =   readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER72002,h_earn = ER77135 ,h_ww = ER77119,h_hw = ER77133, h_earnu= ER77136) %>%
  rename(w_earn = ER77183 , w_ww = ER77167  , w_hw = ER77181 , w_earnu= ER77184) %>%
  mutate(year = 2019) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)


D01 <- select(D01,c("MID","FID","year","earn","earnu","wage","hw","ww"))
D03 <- select(D03,c("MID","FID","year","earn","earnu","hw","ww"))
D05 <- select(D05,c("MID","FID","year","earn","earnu","hw","ww"))

d01 <- identifiers_panel %>% 
  filter(year==2003) %>% 
  mutate(MID = intnum68*1000 + pernum) %>%
  merge(Dt01) %>%
  mutate(year = 2001)

d03 <- identifiers_panel %>% 
  filter(year==2005) %>% 
  mutate(MID = intnum68*1000 + pernum) %>%
  merge(Dt03) %>%
  mutate(year = 2003)

d05 <- identifiers_panel %>% 
  filter(year==2007) %>% 
  mutate(MID = intnum68*1000 + pernum) %>%
  merge(Dt05) %>%
  mutate(year = 2005)



D01 <- merge(D01,d01)
Ih = D01$sn==1
D01[Ih,c("earn","hw","ww")] = D01[Ih,c("h_earn","h_hw","h_ww")]
Iw = D01$sn==2
D01[Iw,c("earn","hw","ww")] = D01[Iw,c("w_earn","w_hw","w_ww")]
D01 <- select(D01,c("MID","FID","year","earn","earnu","wage","hw","ww"))

D03 <- merge(D03,d03)
Ih = D03$sn==1
D03[Ih,c("earn","hw","ww")] = D03[Ih,c("h_earn","h_hw","h_ww")]
Iw = D03$sn==2
D03[Iw,c("earn","hw","ww")] = D03[Iw,c("w_earn","w_hw","w_ww")]
D03 <- D03%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D03 <- select(D03,c("MID","FID","year","earn","earnu","wage","hw","ww"))

D05 <- merge(D05,d05)
Ih = D05$sn==1
D05[Ih,c("earn","hw","ww")] = D05[Ih,c("h_earn","h_hw","h_ww")]
Iw = D05$sn==2
D05[Iw,c("earn","hw","ww")] = D05[Iw,c("w_earn","w_hw","w_ww")]
D05 <- D05%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D05 <- select(D05,c("MID","FID","year","earn","earnu","wage","hw","ww"))

#labor market outcomes 2 years ago are not reported in the individual file for 2007, so I use family file and generate MID and FID by merging with the identifiers file by family interview number
D07 <- identifiers_panel %>% 
  filter(year==2009) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt07, by = c("intnum","year")) %>%
  mutate(year = 2007)

D09 <- identifiers_panel %>% 
  filter(year==2011) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt09, by = c("intnum","year")) %>%
  mutate(year = 2009)

D11 <- identifiers_panel %>% 
  filter(year==2013) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt11, by = c("intnum","year")) %>%
  mutate(year = 2011)

D13 <- identifiers_panel %>% 
  filter(year==2015) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt13, by = c("intnum","year")) %>%
  mutate(year = 2013)

D15 <- identifiers_panel %>% 
  filter(year==2017) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt15, by = c("intnum","year")) %>%
  mutate(year = 2015)

D17 <- identifiers_panel %>% 
  filter(year==2019) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt17, by = c("intnum","year")) %>%
  mutate(year = 2017)

Ih = D07$sn==1
D07[Ih,c("earn","hw","ww","earnu")] = D07[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D07$sn==2
D07[Iw,c("earn","hw","ww","earnu")] = D07[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D07 <- D07%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D07 <- select(D07,c("MID","FID","year","earn","earnu","wage","hw","ww"))


Ih = D09$sn==1
D09[Ih,c("earn","hw","ww","earnu")] = D09[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D09$sn==2
D09[Iw,c("earn","hw","ww","earnu")] = D09[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D09 <- D09%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D09 <- select(D09,c("MID","FID","year","earn","earnu","wage","hw","ww"))


Ih = D11$sn==1
D11[Ih,c("earn","hw","ww","earnu")] = D11[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D11$sn==2
D11[Iw,c("earn","hw","ww","earnu")] = D11[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D11 <- D11%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D11 <- select(D11,c("MID","FID","year","earn","earnu","wage","hw","ww"))


Ih = D13$sn==1
D13[Ih,c("earn","hw","ww","earnu")] = D13[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D13$sn==2
D13[Iw,c("earn","hw","ww","earnu")] = D13[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D13 <- D13%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D13 <- select(D13,c("MID","FID","year","earn","earnu","wage","hw","ww"))


Ih = D15$sn==1
D15[Ih,c("earn","hw","ww","earnu")] = D15[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D15$sn==2
D15[Iw,c("earn","hw","ww","earnu")] = D15[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D15 <- D15%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D15 <- select(D15,c("MID","FID","year","earn","earnu","wage","hw","ww"))

Ih = D17$sn==1
D17[Ih,c("earn","hw","ww","earnu")] = D17[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D17$sn==2
D17[Iw,c("earn","hw","ww","earnu")] = D17[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D17 <- D17%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D17 <- select(D17,c("MID","FID","year","earn","earnu","wage","hw","ww"))

earnings_panel_supplement <- rbind(D97,D99,D01,D03,D05,D07,D09,D11,D13,D15,D17) %>%
  mutate(earn = na_if(na_if(na_if(na_if(earn,-9999999),-999999),99999999),9999999)) %>%
  mutate(earn = na_if(na_if(na_if(earn,-999998),9999998),99999998))%>%
  #mutate(earn = case_when(earnu==1 ~ earn*hw*ww,earnu==2 ~ earn*365,earnu==3 ~ earn*ww, earnu==4 ~ earn*ww/2,earnu==5 ~ earn*12, earnu==6 ~ earn, TRUE ~ NA_real_)) %>%
  mutate(ww = na_if(na_if(ww,99),98),hw=na_if(na_if(hw,999),998)) %>%
  mutate(hrs = ww*hw) %>%
  #mutate(wage2 = case_when(earn>0 & hrs>0 ~ earn/hrs,TRUE ~ NA_real_),0) %>%
  mutate(wage = case_when(wage<=0 ~ NA_real_,wage==999 ~ NA_real_,TRUE ~ wage)) %>%
  mutate(wage = case_when(earn>0 & earnu==6 & hrs>0 & is.na(wage) ~ earn/hrs, TRUE ~ wage))


## -----------------------------------------------------------------------------------------------


## -----------------------------------------------------------------------------------------------

Income_ly <- readxl::read_excel("../../../data/data_PSID/main/total_income/total_income.xlsx") %>%
rename(intnum97=ER10002, tax_income97=ER12069, total_income97=ER12079) %>%
  rename(intnum99=ER13002, tax_income99=ER16452, total_income99=ER16462) %>%
  rename(intnum01=ER17002, tax_income01=ER20449, total_income01=ER20456) %>%
  rename(intnum03=ER21002, tax_income03=ER24100, total_income03=ER24099) %>%
  rename(intnum05=ER25002, tax_income05=ER27953, total_income05=ER28037) %>%
  rename(intnum07=ER36002, tax_income07=ER40943, total_income07=ER41027)
  

Income_T2 <- readxl::read_excel("../../../data/data_PSID/main/total_income/total_income.xlsx") %>%
rename(intnum99=ER13002, total_income97=ER16219) %>%
rename(intnum01=ER17002, total_income99=ER20165) %>%
rename(intnum03=ER21002, total_income01=ER23764)


# Separate files by year
I97 <- Income_ly %>% select(intnum97,tax_income97,total_income97) %>%
  rename(intnum=intnum97,tax_income=tax_income97,total_income=total_income97) %>%
  mutate(year=1997)

I99 <- Income_ly %>% select(intnum99,tax_income99,total_income99) %>%
  rename(intnum=intnum99,tax_income=tax_income99,total_income=total_income99) %>%
  mutate(year=1999)

I99_T2 <- Income_T2 %>% select(intnum99,total_income97) %>%
  rename(intnum=intnum99,total_income=total_income97) %>%
  mutate(year=1999, tax_income=NA_real_)


I01 <- Income_ly %>% select(intnum01,tax_income01,total_income01) %>%
  rename(intnum=intnum01,tax_income=tax_income01,total_income=total_income01) %>%
  mutate(year=2001)

I01_T2 <- Income_T2 %>% select(intnum01,total_income99) %>%
  rename(intnum=intnum01,total_income=total_income99) %>%
  mutate(year=2001, tax_income=NA_real_)

I03 <- Income_ly %>% select(intnum03,tax_income03,total_income03) %>%
  rename(intnum=intnum03,tax_income=tax_income03,total_income=total_income03) %>%
  mutate(year=2003)

I03_T2 <- Income_T2 %>% select(intnum03,total_income01) %>%
  rename(intnum=intnum03,total_income=total_income01) %>%
  mutate(year=2003, tax_income=NA_real_)

I05 <- Income_ly %>% select(intnum05,tax_income05,total_income05) %>%
  rename(intnum=intnum05,tax_income=tax_income05,total_income=total_income05) %>%
  mutate(year=2005)

I07 <- Income_ly %>% select(intnum07,tax_income07,total_income07) %>%
  rename(intnum=intnum07,tax_income=tax_income07,total_income=total_income07) %>%
  mutate(year=2007)




i97 <- identifiers_panel %>% 
  filter(year==1997) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I97) %>%
  mutate(year = 1996)

i99 <- identifiers_panel %>% 
  filter(year==1999) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I99) %>%
  mutate(year = 1998)

i99_T2 <- identifiers_panel %>% 
  filter(year==1999) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I99_T2) %>%
  mutate(total_income=na_if(na_if(total_income,9999999),9999998))%>%
  mutate(year = 1997)

i01 <- identifiers_panel %>% 
  filter(year==2001) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I01) %>%
  mutate(year = 2000)

i01_T2 <- identifiers_panel %>% 
  filter(year==2001) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I01_T2) %>%
    mutate(total_income=na_if(na_if(total_income,999999999),999999998))%>%
  mutate(year = 1999)

i03 <- identifiers_panel %>% 
  filter(year==2003) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I03) %>%
  mutate(year = 2002)

i03_T2 <- identifiers_panel %>% 
  filter(year==2003) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I03_T2) %>%
    mutate(total_income=na_if(na_if(total_income,999999999),999999998))%>%
  mutate(year = 2001)

i05 <- identifiers_panel %>% 
  filter(year==2005) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I05) %>%
  mutate(year = 2004)

i07 <- identifiers_panel %>% 
  filter(year==2007) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I07) %>%
  mutate(year = 2006)


total_income <- rbind(i97,i99,i99_T2,i01,i01_T2,i03,i03_T2,i05,i07) %>%
  select(-intnum)


## -----------------------------------------------------------------------------------------------

D <- readxl::read_excel("../../../data/data_PSID/cds/assessments/ChildAssessments.xlsx") %>% 
  mutate(KID = ER30001*1000 + ER30002)

# first we want to arrange the data nicely. We have the same scores for each year. They are:
col_names <- c("LW_std","LW_raw","PC_std","PC_raw","AP_std","AP_raw","DigSpan","BPE","BPN")

# 3 vectors with the corresponding variable names for each year
names97 <- c("Q3LW_SS","Q3LWRAW","Q3PC_SS","Q3PCRAW","Q3AP_SS","Q3APRAW","Q3DSTOT","BPI_E97","BPI_N97")

names02 <- c("Q24LWSS","Q24LWRAW","Q24PCSS","Q24PCRAW","Q24APSS","Q24APRAW","Q24DSTO","BPI_E02","BPI_N02")


names07 <- c("Q34LWSS","Q34LWRAW","Q34PCSS","Q34PCRAW","Q34APSS","Q34APRAW","Q34DSTO","BPI_E07","BPI_N07")

# separate and recombine files in a panel
D97 <- D[,c("KID",names97)]
names(D97)[2:10] <- col_names
D97$year <- 1997

D02 <- D[,c("KID",names02)]
names(D02)[2:10] <- col_names
D02$year <- 2002

D07 <- D[,c("KID",names07)]
names(D07)[2:10] <- col_names
D07$year <- 2007

D <- rbind(D97,D02,D07) %>%
  as.data.frame()

# Finally, replace missing values
missvals <- c(999,99,999,99,999,99,99,99,99)
for (i in 1:9) {
  D[,col_names[i]] <- na_if(D[,col_names[i]],missvals[i])
}

assessment_panel <- D



## -----------------------------------------------------------------------------------------------
m2 <- readxl::read_excel("../../../data/data_PSID/cds/assessments/PCG-PC.xlsx") %>% 
  mutate(KID = ER30001*1000 + ER30002) %>% 
  select(KID,Q1PCSS,Q1PCRAW)%>%
  rename(m_pc_st= Q1PCSS, m_pc_raw= Q1PCRAW)

# load the auxiliary data and create the Kid ID and the PCG ID.
# then merge by Kid ID with the PCG's test score data
# then drop missing observations
cds <- readxl::read_excel("../../../data/data_PSID/CDS-aux-info.xlsx") %>%
  mutate(KID = CDSCUMID68*1000 + CDSCUMPN,MID = ID68PCG97*1000 + PNPCG97) %>%
  select(KID,MID) %>%
  merge(m2) %>%
  filter(m_pc_st>0) %>%
  select(MID,m_pc_st,m_pc_raw) %>%
  unique()

# there are a small handful of observations where the raw score is the same but the conversion to standardized score differs slightly, in this case we just keep the first observation
cds <- cds %>%
  group_by(MID) %>%
  filter(row_number()==1)

passage_comprehension <- cds



## -----------------------------------------------------------------------------------------------
# 300-399 is obtaining goods/services (remove?)
# 919 tv
T0 <- read.csv("../../../data/data_PSID/cds/time-diary/TD97full.csv") %>%
  mutate(KID=ER30001*1000 + ER30002,year = 1997) %>%
  select(KID,year,COLA,COLG_B,COLG_C,COLG_I,COLG_J,COLG_K,COLH_B,COLH_C,COLH_I,COLH_J,COLH_K,COLF,WDAYWEND,DURATION, COLJ)



T1 <- read.csv("../../../data/data_PSID/cds/time-diary/TD02full.csv") %>%
  mutate(KID=ER30001*1000 + ER30002,year = 2002) %>%
  rename(COLG_B = COLGB_02, COLG_C = COLGC_02 , COLG_I = COLGI_02,COLG_J = COLGJ_02,COLG_K = COLGK_02, COLH_B = COLHB_02, COLH_C = COLHC_02, COLH_I = COLHI_02,COLH_J = COLHJ_02,COLH_K = COLHK_02, WDAYWEND = DIARY_02, DURATION = DUR_02, COLA = COLA_02,COLF = COLF_02, COLJ=COLJ_02) %>%
  select(KID,year,COLA,COLG_B,COLG_C,COLG_I,COLG_J,COLG_K,COLH_B,COLH_C,COLH_I,COLH_J,COLH_K,COLF,WDAYWEND,DURATION, COLJ) %>%
  mutate(COLA = floor(COLA/10)) #<- have to round down

T2 <- read.csv("../../../data/data_PSID/cds/time-diary/TD07full.csv")%>%
  mutate(KID=ER30001*1000 + ER30002,year = 2007) %>%
  rename(COLG_B = COLHB_07, COLG_C = COLHC_07 , COLH_B = COLIB_07, COLH_C = COLIC_07, WDAYWEND = DIARY_07, DURATION = DUR_07, COLA = COLA_07, 
         COLG_I= COLHI_07  , COLG_J= COLHJ_07, COLG_K= COLHK_07, COLH_I= COLII_07 ,COLH_J= COLIJ_07,COLH_K= COLIK_07,COLF = COLG_07, COLJ=COLJ_07) %>%
   select(KID,year,COLA,COLG_B,COLG_C,COLG_I,COLG_J,COLG_K,COLH_B,COLH_C,COLH_I,COLH_J,COLH_K,COLF,WDAYWEND,DURATION, COLJ) %>%
  mutate(COLG_B = if_else(COLG_B==5, as.integer(0), as.integer(COLG_B)),COLG_C = if_else(COLG_C==5, as.integer(0), as.integer(COLG_C)),
         COLH_B = if_else(COLH_B==5, as.integer(0), as.integer(COLH_B)), COLH_C = if_else(COLH_C==5, as.integer(0), as.integer(COLH_C)),#<- 5 corresponds to activity being personal, private, school, or work according to the codebook - I make it 0
         COLA = floor(COLA/10), COLJ = floor(COLJ/10)) #<- have to round down

TD <- rbind(T0,T1,T2) %>%
  mutate(time2ind = COLA!=919) %>% #<-
  mutate(COLG_B = na_if(COLG_B,9),COLG_C = na_if(COLG_C,9),COLH_B = na_if(COLH_B,9), COLH_C = na_if(COLH_C,9),COLG_I = na_if(COLG_I,9),COLG_J = na_if(COLG_J,9),COLH_K = na_if(COLH_K,9),COLH_I = na_if(COLH_I,9),COLH_J = na_if(COLH_J,9),COLH_K = na_if(COLH_K,9)) #<- code in missing categories





## -----------------------------------------------------------------------------------------------

#create a dummy variable equal to 1 if parents are engaged into the "investment" activity

TD$investment=ifelse(TD$COLA==258|TD$COLA==222|TD$COLA==249|TD$COLA==259|TD$COLA==238|
                       TD$COLA==339|TD$COLA==411|TD$COLA==488|TD$COLA==501|TD$COLA==504|
                       TD$COLA==511|TD$COLA==519|TD$COLA==549|TD$COLA==569|
                       (TD$COLA>=709&TD$COLA<=749)|(TD$COLA>=801&TD$COLA<=889)|
                       TD$COLA==939|TD$COLA==941|TD$COLA==959|TD$COLA==942|
                       TD$COLA==943|TD$COLA==962|TD$COLA==967|TD$COLA==979|TD$COLA==963,1,0)

#create an dummy variable equal to 1 if parents are engaged into the "socializing" activity
TD$social=ifelse(TD$COLA==439|TD$COLA==448|TD$COLA==449|(TD$COLA>=601&TD$COLA<=689)|
                   TD$COLA==752|TD$COLA==769|TD$COLA==771|TD$COLA==772|TD$COLA==789,1,0)

#create expanded investment and social dummy variables


TD$ex_soc=ifelse(TD$COLA==108|TD$COLA==109|TD$COLA==118|TD$COLA==119|TD$COLA==698|TD$COLA==699|TD$COLA==799|TD$social==1,1,0)


TD$ex_inv=ifelse(TD$investment==1|
                   TD$COLA==248|TD$COLA==221|TD$COLA==239|TD$COLA==377|
                   TD$COLA==484|TD$COLA==510|TD$COLA==512|TD$COLA==599|
                   TD$COLA==597|TD$COLA==598|TD$COLA==899,1,0)

# a dummy if no parent is around 
TD$no_parent = (TD$COLG_B==0) & (TD$COLG_C==0) & (TD$COLH_B==0) & (TD$COLH_C==0)


# create a formal, non-relative care variable
TD$formal = TD$no_parent & (TD$COLF==84)

# create a relative care variable
TD$relative = TD$no_parent & (TD$COLG_I==1 | TD$COLG_J==1 | TD$COLH_I==1 | TD$COLH_J==1)
# create a relative present variable
TD$relative_present = (TD$COLG_I==1 | TD$COLG_J==1 | TD$COLH_I==1 | TD$COLH_J==1)


# create an informal non-relative care variable. Defined as time in non-relative care that does not occur in a formal center and does not occur at school

TD$informal = TD$no_parent & !TD$relative & (TD$COLF!=84) & (TD$COLF!=80) & (TD$COLG_K==1 | TD$COLH_K==1)



## -----------------------------------------------------------------------------------------------
TD$wght = 2 + 3*TD$WDAYWEND


## -----------------------------------------------------------------------------------------------
TD_agg <- TD %>% 
  group_by(KID,year) %>% 
  summarise(tau_m = sum(DURATION*wght*COLG_B,na.rm = TRUE)/3600, tau_f = sum(DURATION*wght*COLG_C,na.rm=TRUE)/3600, tau_both = sum(DURATION*wght*(COLG_B + (1-COLG_B)*COLG_C),na.rm=TRUE)/3600, tau_m2 = sum(time2ind*DURATION*wght*COLG_B,na.rm = TRUE)/3600, tau_f2 = sum(time2ind*DURATION*wght*COLG_C,na.rm=TRUE)/3600,
            tau_m_investment = sum(DURATION*wght*COLG_B*investment,na.rm = TRUE)/3600,  tau_m_social = sum(DURATION*wght*COLG_B*social,na.rm = TRUE)/3600, tau_m_ex_inv = sum(DURATION*wght*COLG_B*ex_inv,na.rm = TRUE)/3600,  tau_m_ex_soc = sum(DURATION*wght*COLG_B*ex_soc,na.rm = TRUE)/3600, 
            tau_f_investment = sum(DURATION*wght*COLG_C*investment,na.rm = TRUE)/3600,  tau_f_social = sum(DURATION*wght*COLG_C*social,na.rm = TRUE)/3600, tau_f_ex_inv = sum(DURATION*wght*COLG_C*ex_inv,na.rm = TRUE)/3600, tau_f_ex_soc = sum(DURATION*wght*COLG_C*ex_soc,na.rm = TRUE)/3600,
            tau_both_investment = sum(DURATION*wght*(COLG_B + (1-COLG_B)*COLG_C)*investment,na.rm = TRUE)/3600,  tau_both_social = sum(DURATION*wght*(COLG_B + (1-COLG_B)*COLG_C)*social,na.rm = TRUE)/3600, tau_both_ex_inv = sum(DURATION*wght*(COLG_B + (1-COLG_B)*COLG_C)*ex_inv,na.rm = TRUE)/3600, tau_both_ex_soc = sum(DURATION*wght*(COLG_B + (1-COLG_B)*COLG_C)*ex_soc,na.rm = TRUE)/3600,
            formal = sum(DURATION*wght*formal,na.rm = TRUE)/3600,informal = sum(DURATION*wght*informal,na.rm = TRUE)/3600,relative = sum(DURATION*wght*relative,na.rm = TRUE)/3600,relative_present = sum(DURATION*wght*relative_present,na.rm = TRUE)/3600)


#Create expanded social and expanded investment measure
TD_agg <- TD_agg %>% 
  mutate(tau_m_socinvest=tau_m_ex_inv+tau_m_ex_soc,
         tau_f_socinvest=tau_f_ex_inv+tau_f_ex_soc,
         tau_both_socinvest=tau_both_ex_inv+tau_both_ex_soc)



## -----------------------------------------------------------------------------------------------
fr97 <- readxl::read_excel("../../../data/data_PSID/cds/time-diary/family_friends/family_friends.xlsx") %>% 
  mutate(KID=ER30001*1000 + ER30002,year = 1997, Q1B7=case_when(Q1B7>=8 ~ NA_real_, TRUE ~Q1B7),
         live_grand=case_when(BIOGPR97==1 ~ 1,BIOGPR97==2 ~ 0,BIOGPR97==3 ~ 0, TRUE ~ NA_real_)) %>%
  select(KID, year,Q1B7,live_grand)%>%
  rename(freq_family=Q1B7)

#use grandparents living/not living with child in 2001 for 2002 (2003 is also available)
fr02 <- readxl::read_excel("../../../data/data_PSID/cds/time-diary/family_friends/family_friends.xlsx") %>% 
  mutate(KID=ER30001*1000 + ER30002,year = 2002, Q21E6=case_when(Q21E6>=8 ~ NA_real_, TRUE ~Q21E6),
        live_grand=case_when(BIOGPR01==1 ~ 1,BIOGPR01==2 ~ 0,BIOGPR01==3 ~ 0,BIOGPR01==4 ~ 0, TRUE ~ NA_real_)) %>%
  select(KID, year,Q21E6,live_grand)%>%
  rename(freq_family=Q21E6)

fr07 <- readxl::read_excel("../../../data/data_PSID/cds/time-diary/family_friends/family_friends.xlsx") %>% 
  mutate(KID=ER30001*1000 + ER30002,year = 2007, Q31E6=case_when(Q31E6>=8 ~ NA_real_, TRUE ~Q31E6),
              live_grand=case_when(BIOGPR07==1 ~ 1,BIOGPR07==2 ~ 0,BIOGPR07==3 ~ 0,BIOGPR07==4 ~ 0, TRUE ~ NA_real_)) %>%
  select(KID, year,Q31E6,live_grand)%>%
  rename(freq_family=Q31E6)

#stack into a panel
fr <- rbind(fr97,fr02,fr07)

#merge with panel of time
TD_agg <- TD_agg %>%
  left_join(fr)



## -----------------------------------------------------------------------------------------------

E <- read.csv("../../../data/data_PSID/cds/expenditures/Exp_0207.csv")
#generate kid's ID
E <- E %>%
  mutate(KID = ER30001*1000 + ER30002)

#First select kids in 2002 sample and variables describing 2002 hh expenditures

E02<-E[E$KID02==1.1,] %>%select(KID,starts_with("Q21"))
#Substitute to missing if the primary caregiver did not know about costs
E02[(E02$Q21H23A>=8|is.na(E02$Q21H23A)==1|E02$Q21H23A1>=99998),"Q21H23A1"]<-NA
E02[(E02$Q21H23B>=8|is.na(E02$Q21H23B)==1|E02$Q21H23B1>=99998),"Q21H23B1"]<-NA
E02[(E02$Q21H23C>=8|is.na(E02$Q21H23C)==1|E02$Q21H23C1>=9998),"Q21H23C1"]<-NA
E02[(E02$Q21H23D>=8|is.na(E02$Q21H23D)==1|E02$Q21H23D1>=9998),"Q21H23D1"]<-NA
E02[(E02$Q21H23H>=8|is.na(E02$Q21H23H)==1|E02$Q21H23H1>=99998),"Q21H23H1"]<-NA

E02 <- E02 %>%
  rename(Toys = Q21H23A1, Vacation = Q21H23B1, SchSupplies = Q21H23C1, Clothing = Q21H23D1, Food = Q21H23H1) %>%
  mutate(Toys = Toys/52, Vacation = Vacation/52, SchSupplies = SchSupplies/52, Clothing = Clothing/52, Food = Food/52)


#Substitute to missing if the primary caregiver did not know about costs
E02$Q21G5D[(E02$Q21G5D>=9998)] <- NA
E02$Q21H5C[(E02$Q21H5C>=9998)] <- NA
E02$Q21G10C[(E02$Q21G10C>=9998)] <- NA
E02$Q21H8B[(E02$Q21H8B>=9998)] <- NA
E02$Q21G8I[(E02$Q21G8I>=9998)] <- NA
E02$Q21H7F[(E02$Q21H7F>=9998)] <- NA
E02$Q21G7C [(E02$Q21G7C >=9998)] <- NA
E02$Q21H6B[(E02$Q21H6B>=9998)] <- NA
E02$Q21B12A1[(E02$Q21B12A1>=99998)] <- NA


#Combine expenditure on children in diffenet age groups, generate annual tuition cost
E02 <- E02 %>%
  mutate(year = 2002 ) %>%
  mutate(tutoring = Q21G5D + Q21H5C) %>%
  mutate(comm_grps = (Q21G10C + Q21H8B)) %>%
  mutate(sports = (Q21G8I + Q21H7F)) %>%
  mutate(lessons = (Q21G7C + Q21H6B)) %>%
  mutate(tuition = Q21B12A1)%>%
  mutate(tuition = case_when(Q21B12A2==3 ~ 50*as.numeric(tuition),Q21B12A2==5 ~ 12*as.numeric(tuition), TRUE ~ as.numeric(tuition)))%>%
  select(KID, year, tuition, lessons, tutoring, comm_grps, sports, Toys, Vacation, SchSupplies, Clothing, Food)
    

#Select kids in 2007 sample and variables describing 2007 hh expenditures

E07<-E[E$KID07==1.1,] %>%select(KID,starts_with("Q31"))
#Substitute to missing if the primary caregiver did not not about costs
E07[(E07$Q31H23A>=8|is.na(E07$Q31H23A)==1|E07$Q31H23A1>=99998),"Q31H23A1"]<-NA
E07[(E07$Q31H23B>=8|is.na(E07$Q31H23B)==1|E07$Q31H23B1>=99998),"Q31H23B1"]<-NA
E07[(E07$Q31H23C>=8|is.na(E07$Q31H23C)==1|E07$Q31H23C1>=99998),"Q31H23C1"]<-NA
E07[(E07$Q31H23D>=8|is.na(E07$Q31H23D)==1|E07$Q31H23D1>=99998),"Q31H23D1"]<-NA
E07[(E07$Q31H23H>=8|is.na(E07$Q31H23H)==1|E07$Q31H23H1>=99998),"Q31H23H1"]<-NA

E07 <- E07 %>%
  rename(Toys = Q31H23A1, Vacation = Q31H23B1, SchSupplies = Q31H23C1, Clothing = Q31H23D1, Food = Q31H23H1) %>%
  mutate(Toys = Toys/52, Vacation = Vacation/52, SchSupplies = SchSupplies/52, Clothing = Clothing/52, Food = NA)



E07$Q31H5C[(E07$Q31H5C>=9998)] <- NA
E07$Q31H8B[(E07$Q31H8B>=9998)] <- NA
E07$Q31H7F[(E07$Q31H7F>=9998)] <- NA
E07$Q31H6B[(E07$Q31H6B>=9998)] <- NA
E07$Q31B12A1[(E07$Q31B12A1>=99998)] <- NA

#In 2007 children the information is collected for children 10+, generate annual tuition cost
E07 <- E07 %>%
  mutate(year = 2007) %>%
  mutate(tutoring = Q31H5C) %>%
  mutate(comm_grps = Q31H8B) %>%
  mutate(sports = Q31H7F) %>%
  mutate(lessons = Q31H6B) %>%
  mutate(tuition = Q31B12A1)%>%
  mutate(tuition = case_when(Q31B12A2==3 ~ 50*as.numeric(tuition),Q31B12A2==5 ~ 12*as.numeric(tuition), TRUE ~ as.numeric(tuition)))%>%
  select(KID,year,Toys,Vacation,SchSupplies,Clothing,Food,sports,lessons,comm_grps,tutoring,tuition)

#Merge 2002 and 2007 expenditure data, transform annual expenditures to weekly

E <- rbind(E07, E02)
E=E%>%mutate(tuition=tuition/52, 
         comm_grps=comm_grps/52,
         lessons=lessons/52,
         tutoring=tutoring/52,
         sports=sports/52)

#Drop children with missing ID
E <- E[is.na(E$KID)==0,]

cols <- c("KID","year","Toys","Vacation","SchSupplies","Clothing","Food","sports","lessons","comm_grps","tutoring","tuition")

cds_expenditures <- E[,cols]




## -----------------------------------------------------------------------------------------------
PPE <- readxl::read_excel("../../../data/data_PSID/cds/expenditures/CDS_PPE.xlsx") %>%
  mutate(KID = ER30001*1000 + ER30002) %>%
  rename(PPE = Q12A27) %>%
  mutate(PPE = case_when(PPE<99998 ~ PPE)) %>% #<- code in missing values
  select(KID,PPE) %>%
  mutate(year = 1997) #<- add the year for merging into a panel.


## -----------------------------------------------------------------------------------------------
Ch97 <- readxl::read_excel("../../../data/data_PSID/cds/childcare/chcare_9707.xlsx") %>% #<- read in the data
  mutate(KID = ER30001*1000 + ER30002) %>%
  mutate(Q1H21 = case_when(Q1H21<9998 ~ Q1H21),Q1H18 = case_when(Q1H18<8 ~ Q1H18),Q1H19=case_when(Q1H19<998 ~ Q1H19),Q1H21A=case_when(Q1H21A<8 ~ Q1H21A)) %>% #<- fill in missing values 
  mutate(Q1H27 = case_when(Q1H27<9998 ~Q1H27 ),Q1H24= case_when(Q1H24<8 ~ Q1H24),Q1H25=case_when(Q1H25<98 ~ Q1H25),Q1H27A=case_when(Q1H27A<8 ~ Q1H27A)) %>% #<- fill in missing values for second arrangement
  mutate(chcareExp97 = case_when(Q1H21A==1 ~ Q1H21*Q1H19, #, #<- cost reported hourly
                                 Q1H21A==2 ~ Q1H21*Q1H18,  #<- reported daily
                                 Q1H21A==4 ~ Q1H21/2, #<- reported fortnightly
                                 Q1H21A==5 ~ Q1H21/(52/12), #<- reported monthly
                                 Q1H21A==6 ~ Q1H21/52, #<-reported annually
                                 TRUE ~ Q1H21)) %>% #<- reported weekly
    mutate(chcareExpSecond97 = case_when(Q1H27A==1 ~ Q1H27*Q1H25, #, #<- cost reported hourly
                                 Q1H27A==2 ~ Q1H27*Q1H24,  #<- reported daily
                                 Q1H27A==4 ~ Q1H27/2, #<- reported fortnightly
                                 Q1H27A==5 ~ Q1H27/(52/12), #<- reported monthly
                                 Q1H27A==6 ~ Q1H27/52, #<-reported annually
                                 TRUE ~ Q1H27)) %>% #<- reported weekly
  mutate(Price97 = chcareExp97/Q1H19) %>% #<- derive price
  mutate(PriceSecond97 = chcareExpSecond97/Q1H25) %>% #<- derive price
  mutate(Arrange97 = case_when(Q1H14==1 | Q1H14==2 | Q1H14==4 ~ "Relative",
                               Q1H14==3 ~ "Non Relative",
                               Q1H14==5 ~ "Family Center",
                               Q1H14==7 ~ "Care Center",
                               Q1H14==8 ~ "Before/After School Prog."))%>% 
  mutate(ArrangeSecond97 = case_when(Q1H15==1 | Q1H15==2 | Q1H15==4 ~ "Relative",
                               Q1H15==3 ~ "Non Relative",
                               Q1H15==5 ~ "Family Center",
                               Q1H15==7 ~ "Care Center",
                               Q1H15==8 ~ "Before/After School Prog."))



## -----------------------------------------------------------------------------------------------
hours_per_day <- Ch97 %>% filter(Q1H18>0) %>% summarise(mean(Q1H19/Q1H18,na.rm = TRUE))
hours_per_day[1,1]


## -----------------------------------------------------------------------------------------------

Ch02 <- readxl::read_excel("../../../data/data_PSID/cds/childcare/chcare_9707.xlsx") %>%
    mutate(KID = ER30001*1000 + ER30002) %>%
  mutate(Q21C15A = case_when(Q21C15A<9998 ~ Q21C15A),Q21C13=case_when(Q21C13<998 ~ Q21C13),Q21C15B=case_when(Q21C15B<8 ~ Q21C15B)) %>% #<- fill in missing values 
  mutate(Q21C21A = case_when(Q21C21A<998 ~ Q21C21A),Q21C19=case_when(Q21C19<998 ~ Q21C19),Q21C21B=case_when(Q21C21B<8 ~ Q21C21B)) %>% #<- fill in missing values for second childcare measure
  mutate(chcareExp02 = case_when(Q21C15B==1 ~ Q21C15A*Q21C13, #<- cost reported hourly
                                 Q21C15B==2 ~ Q21C15A*Q21C13/3.1, #<- reported daily?
                                 Q21C15B==4 ~ Q21C15A/2, #<- reported fortnightly
                                 Q21C15B==5 ~ Q21C15A/(52/12), #<- reported monthly
                                 Q21C15B==6 ~ Q21C15A/52,#<-reported annually
                                 TRUE ~ Q21C15A)) %>% #<- reported weekly
    mutate(chcareExpSecond02 = case_when(Q21C21B==1 ~ Q21C21A*Q21C19, #<- cost reported hourly
                                 Q21C21B==2 ~ Q21C21A*Q21C19/3.1, #<- reported daily?
                                 Q21C21B==4 ~ Q21C21A/2, #<- reported fortnightly
                                 Q21C21B==5 ~ Q21C21A/(52/12), #<- reported monthly
                                 Q21C21B==6 ~ Q21C21A/52,#<-reported annually
                                 TRUE ~ Q21C21A)) %>% #<- reported weekly
  mutate(Price02 = chcareExp02/Q21C13) %>% #<- derive hourly price
  mutate(PriceSecond02 = chcareExpSecond02/Q21C19) %>% #<- derive hourly price
  mutate(Arrange02 = case_when(Q21C12==1 | Q21C12==2  ~ "Relative",
                               Q21C12==3 ~ "Non Relative",
                               Q21C12==5 ~ "Family Center",
                               Q21C12==7 ~ "Care Center",
                               Q21C12==8 ~ "Before/After School Prog."))%>%
  mutate(ArrangeSecond02 = case_when(Q21C18==1 | Q21C18==2  ~ "Relative",
                               Q21C18==3 ~ "Non Relative",
                               Q21C18==5 ~ "Family Center",
                               Q21C18==7 ~ "Care Center",
                               Q21C18==8 ~ "Before/After School Prog."))



Ch07 <- readxl::read_excel("../../../data/data_PSID/cds/childcare/chcare_9707.xlsx") %>%
    mutate(KID = ER30001*1000 + ER30002) %>%
  mutate(Q31C15A = case_when(Q31C15A<9998 ~ Q31C15A),Q31C13=case_when(Q31C13<998 ~ Q31C13),Q31C15B=case_when(Q31C15B<8 ~ Q31C15B)) %>% #<- fill in missing values 
  mutate(Q31C21A = case_when(Q31C21A<998 ~ Q31C21A),Q31C19=case_when(Q31C19<998 ~ Q31C19),Q31C21B=case_when(Q31C21B<8 ~ Q31C21B)) %>% #<- fill in missing values for second childcare measure
  mutate(chcareExp07 = case_when(Q31C15B==1 ~ Q31C15A*Q31C13, #<- cost reported hourly
                                 Q31C15B==2 ~ Q31C15A*Q31C13/3.1, #<- reported daily?
                                 Q31C15B==4 ~ Q31C15A/2, #<- reported fortnightly
                                 Q31C15B==5 ~ Q31C15A/(52/12), #<- reported monthly
                                 Q31C15B==6 ~ Q31C15A/52,#<-reported annually
                                 TRUE ~ Q31C15A)) %>% #<- reported weekly
    mutate(chcareExpSecond07 = case_when(Q31C21B==1 ~ Q31C21A*Q31C19, #<- cost reported hourly
                                 Q31C21B==2 ~ Q31C21A*Q31C19/3.1, #<- reported daily?
                                 Q31C21B==4 ~ Q31C21A/2, #<- reported fortnightly
                                 Q31C21B==5 ~ Q31C21A/(52/12), #<- reported monthly
                                 Q31C21B==6 ~ Q31C21A/52,#<-reported annually
                                 TRUE ~ Q31C21A)) %>% #<- reported weekly
  mutate(Price07 = chcareExp07/Q31C13) %>% #<- derive hourly price
  mutate(PriceSecond07 = chcareExpSecond07/Q31C19) %>% #<- derive hourly price
  mutate(Arrange07 = case_when(Q31C12==1 | Q31C12==2  ~ "Relative",
                               Q31C12==3 ~ "Non Relative",
                               Q31C12==5 ~ "Family Center",
                               Q31C12==7 ~ "Care Center",
                               Q31C12==8 ~ "Before/After School Prog."))%>%
  mutate(ArrangeSecond07 = case_when(Q31C18==1 | Q31C18==2  ~ "Relative",
                               Q31C18==3 ~ "Non Relative",
                               Q31C18==5 ~ "Family Center",
                               Q31C18==7 ~ "Care Center",
                               Q31C18==8 ~ "Before/After School Prog."))



## -----------------------------------------------------------------------------------------------
     
# this function calculates the weekly rate given the data
CalcRate <- function(varname,rate,cost,days,hours,Ch) {
  I_miss <- Ch[,cost]>=99998
  Ch[I_miss,cost] <- NA
  Ch[,varname] <- Ch[,cost]
  # Case 0: Weekly
  Ic = (Ch[,rate]==3)
  Ch[Ic,varname] = Ch[Ic,cost]
  # Case 1: Hourly
  Ic = (Ch[,rate]==1)
  Ch[Ic,varname] = Ch[Ic,cost]*Ch[Ic,hours]
  # Case 2: Daily
  Ic = (Ch[,rate]==2)
  Ch[Ic,varname] = Ch[Ic,cost]*Ch[Ic,days]
  # Case 4: Fortnightly
  Ic = (Ch[,rate]==4)
  Ch[Ic,varname] = Ch[Ic,cost]/2
  # Case 5: Monthly
  Ic = (Ch[,rate]==5)
  Ch[Ic,varname] = Ch[Ic,cost]/4
  # Case 6: Annual
  Ic = (Ch[,rate]==6)
  Ch[Ic,varname] = Ch[Ic,cost]/52
  Ch
}


ch_hist <- readxl::read_excel("../../../data/data_PSID/cds/childcare/Childcare_history.xlsx") %>%
  mutate(KID = ER30001*1000 + ER30002)


for (i in seq(1,8)) {
  vname = paste("exp97_",i,sep="")
  rate = paste("Q1H7A_",i,sep="")
  cost = paste("Q1H7_",i,sep="")
  days = paste("Q1H5_",i,sep="")
  hours = paste("Q1H6_",i,sep="")
  # replace missing hours with NA:
  ch_hist[ch_hist[,hours]==998|ch_hist[,hours]==999,hours]= NA_real_
 #ch_hist[,hours] = na_if(ch_hist[,hours],998)
 #ch_hist[,hours] = na_if(ch_hist[,hours],999)
  ch_hist <- CalcRate(vname,rate,cost,days,hours,ch_hist)
}


ch_hist$Arrange <- -1
ch_hist$Freq <- 0
ch_hist$Exp97 <- -1
ch_hist$Hours97 <- -1
ch_hist$Exp97Tot <- 0
ch_hist$Hours97Tot <- 0

for (i in seq(1,nrow(ch_hist))) {
  if (ch_hist[i,"Q1H1Y"]==0) {
    ch_hist[i,"Arrange"] = 0
  }
  else {
    for (j in seq(1,8)) {
      if (ch_hist[i,paste("Q1H8_",j,"Y",sep="")]==96) {
        freq_var = paste("Q1H5_",j,sep="")
        ch_hist[i,"Exp97Tot"] = ch_hist[i,"Exp97Tot"] + ch_hist[i,paste("exp97_",j,sep="")]
        ch_hist[i,"Hours97Tot"] = ch_hist[i,"Hours97Tot"] + ch_hist[i,paste("Q1H6_",j,sep="")]
        days_per_week = ch_hist[i,freq_var]
        if (days_per_week>ch_hist[i,"Freq"] & days_per_week<8) {
          ch_hist[i,"Arrange"] = ch_hist[i,paste("Q1H4_",j,sep="")]
          ch_hist[i,"Freq"] = ch_hist[i,freq_var]
          ch_hist[i,"Exp97"] = ch_hist[i,paste("exp97_",j,sep="")]
          ch_hist[i,"Hours97"] = ch_hist[i,paste("Q1H6_",j,sep="")]
        }
      }
    }
  }
}

# now code up arrangements:
# - Childcare arrangements
ch_hist$Arrange97 <- NA
I1 <- ch_hist$Arrange==0
ch_hist$Arrange97[I1] <- "Never Cared for By Other"
I1 <- is.na(ch_hist$Arrange)
ch_hist$Arrange97[I1] <- "No Detectable Arrangement"
I1 <- ch_hist$Arrange==1 | ch_hist$Arrange==3
ch_hist$Arrange97[I1] <- "Relative"
I1 <- ch_hist$Arrange==2 | ch_hist$Arrange==4
ch_hist$Arrange97[I1] <- "Non Relative"
I1 <- ch_hist$Arrange==5
ch_hist$Arrange97[I1] <- "Head Start"
I1 <- ch_hist$Arrange==6
ch_hist$Arrange97[I1] <- "Care Center"
I1 <- ch_hist$Arrange==7
ch_hist$Arrange97[I1] <- "Before/After School Prog."
I1 <- ch_hist$Arrange==8
ch_hist$Arrange97[I1] <- "Self Care"
I1 <- ch_hist$Arrange>=97
ch_hist$Arrange97[I1] <- "Unkown Type"

# code a more basic breakdown:
ch_hist$CaretypeBroad <- "External"
ch_hist$CaretypeBroad[ch_hist$Arrange==0] <- "Never Care for by Other"
ch_hist$CaretypeBroad[is.na(ch_hist$Arrange)] <- "No Current Detectable"
  


## -----------------------------------------------------------------------------------------------
C97 <- Ch97[,c("KID","chcareExp97","Arrange97","Price97","chcareExpSecond97","ArrangeSecond97","PriceSecond97")]
C97$year <- 1997
C97 <- rename(C97,chcare = chcareExp97,Arrange = Arrange97, Price = Price97,
              chcare_second=chcareExpSecond97, ArrangeSecond = ArrangeSecond97, PriceSecond = PriceSecond97)
# now load preK data and merge, replacing the child care value with this value when available
C97 <- ch_hist %>%
  select(KID,Exp97Tot) %>%
  mutate(Exp97Tot = na_if(Exp97Tot,9998)) %>%
  merge(C97) %>%
  mutate(chcare = case_when(Exp97Tot>0 ~ Exp97Tot,TRUE ~ chcare)) %>%
  select(-Exp97Tot)


C02 <- Ch02[,c("KID","chcareExp02","Arrange02","Price02","chcareExpSecond02","ArrangeSecond02","PriceSecond02")]
C02$year <- 2002
C02 <- rename(C02,chcare = chcareExp02,Arrange = Arrange02, Price = Price02,
              chcare_second=chcareExpSecond02, ArrangeSecond = ArrangeSecond02, PriceSecond = PriceSecond02)

C07 <- Ch07[,c("KID","chcareExp07","Arrange07","Price07","chcareExpSecond07","ArrangeSecond07","PriceSecond07")]
C07$year <- 2007
C07 <- rename(C07,chcare = chcareExp07,Arrange = Arrange07, Price = Price07,
              chcare_second=chcareExpSecond07, ArrangeSecond = ArrangeSecond07, PriceSecond = PriceSecond07)


cds_childcare <- rbind(C97,C02,C07) 



## -----------------------------------------------------------------------------------------------

age_month <- readxl::read_excel("../../../data/data_PSID/cds/staff-child-ratios/age_month.xlsx") %>%
      mutate(KID = ER30001*1000 + ER30002) %>%
      rename(age_month_1997=AGEATPCG)%>%
      #replace full birthdate unknown values to missing
      mutate(age_month_1997=na_if(age_month_1997,999.9))%>%
      mutate(age_month_1997=floor(age_month_1997))%>%
      mutate(month_1997 =PCGIWYR  * 12 + PCGIWMON )%>%
      mutate(month_2002 = Q21IWYR * 12 + Q21IWMTH )%>%
      mutate(month_2007 =  Q31IWYR * 12 + Q31IWMTH)

#Compute age in months at 1997 interview
age_month_1997 <- age_month %>%
      select(KID, age_month_1997)%>%
      rename(age_month=age_month_1997)%>%
      mutate(year=1997)

#Compute age in months at 2002 interview using age in months in 1997 
#and time difference between 2002 and 1997 interviews

age_month_2002 <-  age_month %>%
      select(KID, age_month_1997, month_2002, month_1997)%>%
      mutate(age_month=age_month_1997+month_2002-month_1997, year=2002)%>%
      select(-month_2002, -month_1997, -age_month_1997)

#Compute age in months at 2007 interview using age in months in 1997 
#and time difference between 2007 and 1997 interviews

age_month_2007 <-  age_month %>%
      select(KID, age_month_1997, month_2007, month_1997)%>%
      mutate(age_month=age_month_1997+month_2007-month_1997, year=2007)%>%
      select(-month_2007, -month_1997, -age_month_1997)

#Make a panel for age in months 1997, 2002, and 2007
age_month <- rbind(age_month_1997,age_month_2002, age_month_2007)




## -----------------------------------------------------------------------------------------------
staff_ratio <- readxl::read_excel("../../../data/data_PSID/cds/staff-child-ratios/Ratios_Center_Based_2002.xlsx")


####################
#Age 0-8 months
####################
staff_ratio_0_8 <- staff_ratio %>%
  select(state_str,month_0_8)%>%
  mutate(age_month=0)%>%
  rename(child_staff_ratio=month_0_8)

# Create a panel dataset
for (m in 1:8) {
  temp <- staff_ratio %>%
    select(state_str,month_0_8)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_0_8)
  
  staff_ratio_0_8= rbind(staff_ratio_0_8,temp)
}

#Second version with cutoffs 6 months later
staff_ratio_0_14 <- staff_ratio %>%
  select(state_str,month_0_8)%>%
  mutate(age_month=0)%>%
  rename(child_staff_ratio=month_0_8)

# Create a panel dataset
for (m in 1:14) {
  temp <- staff_ratio %>%
    select(state_str,month_0_8)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_0_8)
  staff_ratio_0_14= rbind(staff_ratio_0_14,temp)
}




####################
#Age 9-17 months
####################
staff_ratio_9_17 <- staff_ratio %>%
  select(state_str,month_9_17)%>%
  mutate(age_month=9)%>%
  rename(child_staff_ratio=month_9_17)

# Create a panel dataset
for (m in 10:17) {
  temp <- staff_ratio %>%
    select(state_str,month_9_17)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_9_17)
  staff_ratio_9_17=rbind(staff_ratio_9_17,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_15_23 <- staff_ratio %>%
  select(state_str,month_9_17)%>%
  mutate(age_month=15)%>%
  rename(child_staff_ratio=month_9_17)

# Create a panel dataset
for (m in 16:23) {
  temp <- staff_ratio %>%
    select(state_str,month_9_17)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_9_17)
  staff_ratio_15_23=rbind(staff_ratio_15_23,temp)
}
####################
#Age 18-26
####################

staff_ratio_18_26 <- staff_ratio %>%
  select(state_str,month_18_26)%>%
  mutate(age_month=18)%>%
  rename(child_staff_ratio=month_18_26)

# Create a panel dataset
for (m in 19:26) {
  temp <- staff_ratio %>%
    select(state_str,month_18_26)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_18_26)
  staff_ratio_18_26=rbind(staff_ratio_18_26,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_24_32 <- staff_ratio %>%
  select(state_str,month_18_26)%>%
  mutate(age_month=24)%>%
  rename(child_staff_ratio=month_18_26)

# Create a panel dataset
for (m in 25:32) {
  temp <- staff_ratio %>%
    select(state_str,month_18_26)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_18_26)
  staff_ratio_24_32=rbind(staff_ratio_24_32,temp)
}

####################
#Age 27-35
####################


staff_ratio_27_35 <- staff_ratio %>%
  select(state_str,month_27_35)%>%
  mutate(age_month=27)%>%
  rename(child_staff_ratio=month_27_35)

# Create a panel dataset
for (m in 28:35) {
  temp <- staff_ratio %>%
    select(state_str,month_27_35)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_27_35)
  staff_ratio_27_35=rbind(staff_ratio_27_35,temp)
}


#Second version with cutoffs 6 months later

staff_ratio_33_41 <- staff_ratio %>%
  select(state_str,month_27_35)%>%
  mutate(age_month=33)%>%
  rename(child_staff_ratio=month_27_35)

# Create a panel dataset
for (m in 34:41) {
  temp <- staff_ratio %>%
    select(state_str,month_27_35)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_27_35)
  
  staff_ratio_33_41=rbind(staff_ratio_33_41,temp)
}

####################
#Age 36-47 (3 years)
####################


staff_ratio_36_47 <- staff_ratio %>%
  select(state_str,month_36_47)%>%
  mutate(age_month=36)%>%
  rename(child_staff_ratio=month_36_47)

# Create a panel dataset
for (m in 37:47) {
  temp <- staff_ratio %>%
    select(state_str,month_36_47)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_36_47)
  
  staff_ratio_36_47=rbind(staff_ratio_36_47,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_42_53 <- staff_ratio %>%
  select(state_str,month_36_47)%>%
  mutate(age_month=42)%>%
  rename(child_staff_ratio=month_36_47)

# Create a panel dataset
for (m in 43:53) {
  temp <- staff_ratio %>%
    select(state_str,month_36_47)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_36_47)
  
  staff_ratio_42_53=rbind(staff_ratio_42_53,temp)
}

####################
#Age 48-59 (4 years)
####################

staff_ratio_48_59 <- staff_ratio %>%
  select(state_str,month_48_59)%>%
  mutate(age_month=48)%>%
  rename(child_staff_ratio=month_48_59)

# Create a panel dataset
for (m in 49:59) {
  temp <- staff_ratio %>%
    select(state_str,month_48_59)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_48_59)
  
  staff_ratio_48_59=rbind(staff_ratio_48_59,temp)
}


#Second version with cutoffs 6 months later


staff_ratio_54_65 <- staff_ratio %>%
  select(state_str,month_48_59)%>%
  mutate(age_month=54)%>%
  rename(child_staff_ratio=month_48_59)

# Create a panel dataset
for (m in 55:65) {
  temp <- staff_ratio %>%
    select(state_str,month_48_59)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_48_59)
  
  staff_ratio_54_65=rbind(staff_ratio_54_65,temp)
}

####################
#Age 60-71 (5 years)
####################

staff_ratio_60_71 <- staff_ratio %>%
  select(state_str,month_60_71)%>%
  mutate(age_month=60)%>%
  rename(child_staff_ratio=month_60_71)

# Create a panel dataset
for (m in 61:71) {
  temp <- staff_ratio %>%
    select(state_str,month_60_71)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_60_71)
  
  staff_ratio_60_71=rbind(staff_ratio_60_71,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_66_77 <- staff_ratio %>%
  select(state_str,month_60_71)%>%
  mutate(age_month=66)%>%
  rename(child_staff_ratio=month_60_71)

# Create a panel dataset
for (m in 67:77) {
  temp <- staff_ratio %>%
    select(state_str,month_60_71)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_60_71)
  
  staff_ratio_66_77=rbind(staff_ratio_66_77,temp)
}

####################
#Age 72-83 (6 years)
####################


staff_ratio_72_83 <- staff_ratio %>%
  select(state_str,month_72_83)%>%
  mutate(age_month=72)%>%
  rename(child_staff_ratio=month_72_83)

# Create a panel dataset
for (m in 73:83) {
  temp <- staff_ratio %>%
    select(state_str,month_72_83)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_72_83)
  
  staff_ratio_72_83=rbind(staff_ratio_72_83,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_78_89 <- staff_ratio %>%
  select(state_str,month_72_83)%>%
  mutate(age_month=78)%>%
  rename(child_staff_ratio=month_72_83)

# Create a panel dataset
for (m in 79:89) {
  temp <- staff_ratio %>%
    select(state_str,month_72_83)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_72_83)
  
  staff_ratio_78_89=rbind(staff_ratio_78_89,temp)
}

####################
#Age 84-95 (7 years)
####################

staff_ratio_84_95 <- staff_ratio %>%
  select(state_str,month_84_95)%>%
  mutate(age_month=84)%>%
  rename(child_staff_ratio=month_84_95)

# Create a panel dataset
for (m in 85:95) {
  temp <- staff_ratio %>%
    select(state_str,month_84_95)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_84_95)
  
  staff_ratio_84_95=rbind(staff_ratio_84_95,temp)
}

#Second version with cutoffs 6 months later
staff_ratio_90_101 <- staff_ratio %>%
  select(state_str,month_84_95)%>%
  mutate(age_month=90)%>%
  rename(child_staff_ratio=month_84_95)

# Create a panel dataset
for (m in 91:101) {
  temp <- staff_ratio %>%
    select(state_str,month_84_95)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_84_95)
  staff_ratio_90_101=rbind(staff_ratio_90_101,temp)
}

####################
#Age 96-119 (8-9 years old)
####################


staff_ratio_96_119 <- staff_ratio %>%
  select(state_str,month_96_119)%>%
  mutate(age_month=96)%>%
  rename(child_staff_ratio=month_96_119)

# Create a panel dataset
for (m in 97:119) {
  temp <- staff_ratio %>%
    select(state_str,month_96_119)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_96_119)
  staff_ratio_96_119=rbind(staff_ratio_96_119,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_102_125 <- staff_ratio %>%
  select(state_str,month_96_119)%>%
  mutate(age_month=102)%>%
  rename(child_staff_ratio=month_96_119)

# Create a panel dataset
for (m in 103:125) {
  temp <- staff_ratio %>%
    select(state_str,month_96_119)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_96_119)
  
  staff_ratio_102_125 =rbind(staff_ratio_102_125 ,temp)
}


####################
#Age 120+ (10+ years)
####################


staff_ratio_120 <- staff_ratio %>%
  select(state_str,month_120)%>%
  mutate(age_month=120)%>%
  rename(child_staff_ratio=month_120)

# Create a panel dataset
for (m in 121:160) {
  temp <- staff_ratio %>%
    select(state_str,month_120)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_120)
  staff_ratio_120=rbind(staff_ratio_120,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_126 <- staff_ratio %>%
  select(state_str,month_120)%>%
  mutate(age_month=126)%>%
  rename(child_staff_ratio=month_120)

# Create a panel dataset
for (m in 127:160) {
  temp <- staff_ratio %>%
    select(state_str,month_120)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_120)
  staff_ratio_126=rbind(staff_ratio_126,temp)
}


panel_staff_ratio_6m <- rbind(staff_ratio_0_14, staff_ratio_15_23, staff_ratio_24_32, staff_ratio_33_41, staff_ratio_42_53, 
                              staff_ratio_54_65, staff_ratio_66_77, staff_ratio_78_89,staff_ratio_90_101 , 
                              staff_ratio_102_125,staff_ratio_126)%>%
                              #generate staff-to-child from child-to-staff ratio with 6 months lag
                              mutate(staff_ratio_6m=1/child_staff_ratio)%>%
                              select(-child_staff_ratio)  

panel_staff_ratio_2002 <- rbind(staff_ratio_0_8, staff_ratio_9_17, staff_ratio_18_26, staff_ratio_27_35, staff_ratio_36_47, 
                                staff_ratio_48_59, staff_ratio_60_71, staff_ratio_72_83,staff_ratio_84_95 , 
                                staff_ratio_96_119,staff_ratio_120)%>%
                                #generate staff-to-child from child-to-staff ratio
                                mutate(staff_ratio=1/child_staff_ratio)%>%
                                select(-child_staff_ratio) %>%
                                merge(panel_staff_ratio_6m)%>%
                                mutate(year=2002)

#Use 2002 ratios for 1997 interview
panel_staff_ratio_1997 <- panel_staff_ratio_2002%>%
mutate(year=1997)


#Delete all objects that start with staff_ratio to generate a 2007 panel

# List all objects in the current environment
objects <- ls()

# Identify objects starting with "staff_ratio"
objects_to_delete <- objects[grep("^staff_ratio", objects)]

# Delete objects starting with "staff_ratio"
rm(list = objects_to_delete)

###############################################################
##########################2007 ratios##########################
###############################################################

#Create a panel of mandatory staff-child ratio by state/age of child in months

staff_ratio <- readxl::read_excel("../../../data/data_PSID/cds/staff-child-ratios/Ratios_Center_Based_2007.xlsx")


####################
#Age 0-8 months
####################
staff_ratio_0_8 <-  staff_ratio %>%
  select(state_str,month_0_8)%>%
  mutate(age_month=0)%>%
  rename(child_staff_ratio=month_0_8)

# Create a panel dataset
for (m in 1:8) {
  temp <-  staff_ratio %>%
    select(state_str,month_0_8)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_0_8)
  
  staff_ratio_0_8= rbind( staff_ratio_0_8,temp)
}

#Second version with cutoffs 6 months later
staff_ratio_0_14 <-  staff_ratio %>%
  select(state_str,month_0_8)%>%
  mutate(age_month=0)%>%
  rename(child_staff_ratio=month_0_8)

# Create a panel dataset
for (m in 1:14) {
  temp <-  staff_ratio %>%
    select(state_str,month_0_8)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_0_8)
  staff_ratio_0_14= rbind(staff_ratio_0_14,temp)
}




####################
#Age 9-17 months
####################
staff_ratio_9_17 <-  staff_ratio %>%
  select(state_str,month_9_17)%>%
  mutate(age_month=9)%>%
  rename(child_staff_ratio=month_9_17)

# Create a panel dataset
for (m in 10:17) {
  temp <-  staff_ratio %>%
    select(state_str,month_9_17)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_9_17)
  staff_ratio_9_17=rbind(staff_ratio_9_17,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_15_23 <-  staff_ratio %>%
  select(state_str,month_9_17)%>%
  mutate(age_month=15)%>%
  rename(child_staff_ratio=month_9_17)

# Create a panel dataset
for (m in 16:23) {
  temp <-  staff_ratio %>%
    select(state_str,month_9_17)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_9_17)
  staff_ratio_15_23=rbind(staff_ratio_15_23,temp)
}
####################
#Age 18-26
####################

staff_ratio_18_26 <-  staff_ratio %>%
  select(state_str,month_18_26)%>%
  mutate(age_month=18)%>%
  rename(child_staff_ratio=month_18_26)

# Create a panel dataset
for (m in 19:26) {
  temp <-  staff_ratio %>%
    select(state_str,month_18_26)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_18_26)
  staff_ratio_18_26=rbind(staff_ratio_18_26,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_24_32 <-  staff_ratio %>%
  select(state_str,month_18_26)%>%
  mutate(age_month=24)%>%
  rename(child_staff_ratio=month_18_26)

# Create a panel dataset
for (m in 25:32) {
  temp <-  staff_ratio %>%
    select(state_str,month_18_26)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_18_26)
  staff_ratio_24_32=rbind(staff_ratio_24_32,temp)
}

####################
#Age 27-35
####################


staff_ratio_27_35 <-  staff_ratio %>%
  select(state_str,month_27_35)%>%
  mutate(age_month=27)%>%
  rename(child_staff_ratio=month_27_35)

# Create a panel dataset
for (m in 28:35) {
  temp <-  staff_ratio %>%
    select(state_str,month_27_35)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_27_35)
  staff_ratio_27_35=rbind(staff_ratio_27_35,temp)
}


#Second version with cutoffs 6 months later

staff_ratio_33_41 <-  staff_ratio %>%
  select(state_str,month_27_35)%>%
  mutate(age_month=33)%>%
  rename(child_staff_ratio=month_27_35)

# Create a panel dataset
for (m in 34:41) {
  temp <-  staff_ratio %>%
    select(state_str,month_27_35)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_27_35)
  
  staff_ratio_33_41=rbind(staff_ratio_33_41,temp)
}

####################
#Age 36-47 (3 years)
####################


staff_ratio_36_47 <-  staff_ratio %>%
  select(state_str,month_36_47)%>%
  mutate(age_month=36)%>%
  rename(child_staff_ratio=month_36_47)

# Create a panel dataset
for (m in 37:47) {
  temp <-  staff_ratio %>%
    select(state_str,month_36_47)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_36_47)
  
  staff_ratio_36_47=rbind(staff_ratio_36_47,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_42_53 <-  staff_ratio %>%
  select(state_str,month_36_47)%>%
  mutate(age_month=42)%>%
  rename(child_staff_ratio=month_36_47)

# Create a panel dataset
for (m in 43:53) {
  temp <- staff_ratio %>%
    select(state_str,month_36_47)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_36_47)
  
  staff_ratio_42_53=rbind(staff_ratio_42_53,temp)
}

####################
#Age 48-59 (4 years)
####################

staff_ratio_48_59 <- staff_ratio %>%
  select(state_str,month_48_59)%>%
  mutate(age_month=48)%>%
  rename(child_staff_ratio=month_48_59)

# Create a panel dataset
for (m in 49:59) {
  temp <- staff_ratio %>%
    select(state_str,month_48_59)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_48_59)
  
  staff_ratio_48_59=rbind(staff_ratio_48_59,temp)
}


#Second version with cutoffs 6 months later


staff_ratio_54_65 <- staff_ratio %>%
  select(state_str,month_48_59)%>%
  mutate(age_month=54)%>%
  rename(child_staff_ratio=month_48_59)

# Create a panel dataset
for (m in 55:65) {
  temp <- staff_ratio %>%
    select(state_str,month_48_59)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_48_59)
  
  staff_ratio_54_65=rbind(staff_ratio_54_65,temp)
}

####################
#Age 60-119 (5-10 years)
####################

staff_ratio_60_119 <- staff_ratio %>%
  select(state_str,month_60_119)%>%
  mutate(age_month=60)%>%
  rename(child_staff_ratio=month_60_119)

# Create a panel dataset
for (m in 61:119) {
  temp <- staff_ratio %>%
    select(state_str,month_60_119)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_60_119)
  
  staff_ratio_60_119=rbind(staff_ratio_60_119,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_66_125 <- staff_ratio %>%
  select(state_str,month_60_119)%>%
  mutate(age_month=66)%>%
  rename(child_staff_ratio=month_60_119)

# Create a panel dataset
for (m in 67:125) {
  temp <- staff_ratio %>%
    select(state_str,month_60_119)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_60_119)
  
  staff_ratio_66_125=rbind(staff_ratio_66_125,temp)
}


####################
#Age 120+ (10+ years)
####################


staff_ratio_120 <- staff_ratio %>%
  select(state_str,month_120)%>%
  mutate(age_month=120)%>%
  rename(child_staff_ratio=month_120)

# Create a panel dataset
for (m in 121:160) {
  temp <- staff_ratio %>%
    select(state_str,month_120)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_120)
  staff_ratio_120=rbind(staff_ratio_120,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_126 <- staff_ratio %>%
  select(state_str,month_120)%>%
  mutate(age_month=126)%>%
  rename(child_staff_ratio=month_120)

# Create a panel dataset
for (m in 127:160) {
  temp <- staff_ratio %>%
    select(state_str,month_120)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_120)
  staff_ratio_126=rbind(staff_ratio_126,temp)
}





panel_staff_ratio_6m_2007 <- rbind(staff_ratio_0_14, staff_ratio_15_23, staff_ratio_24_32, staff_ratio_33_41, staff_ratio_42_53, 
                                   staff_ratio_54_65, staff_ratio_66_125,staff_ratio_126)%>%
                                    #generate staff-to-child from child-to-staff ratio with 6 months lag
                                    mutate(staff_ratio_6m=1/child_staff_ratio)%>%
                                    select(-child_staff_ratio)  

panel_staff_ratio_2007 <- rbind(staff_ratio_0_8, staff_ratio_9_17, staff_ratio_18_26, staff_ratio_27_35, staff_ratio_36_47, 
                                staff_ratio_48_59, staff_ratio_60_119,staff_ratio_120)%>%
                                #generate staff-to-child from child-to-staff ratio
                                mutate(staff_ratio=1/child_staff_ratio)%>%
                                select(-child_staff_ratio) %>%
                                merge(panel_staff_ratio_6m_2007)%>%
                                mutate(year=2007)


#Delete all objects that start with staff_ratio again

# List all objects in the current environment
objects <- ls()

# Identify objects starting with "staff_ratio"
objects_to_delete <- objects[grep("^staff_ratio", objects)]

# Delete objects starting with "staff_ratio"
rm(list = objects_to_delete)


#Merge all  panels with ratios together
staff_ratio_panel <- rbind(panel_staff_ratio_1997, panel_staff_ratio_2002, panel_staff_ratio_2007)



## -----------------------------------------------------------------------------------------------
# get an index of CDS children
KID <- assessment_panel %>%
  select(KID) %>%
  unique()


## -----------------------------------------------------------------------------------------------
child_record <- read.csv("../../../data/data_PSID/main/childbirth/Childbirth.csv") %>%
  mutate(X = NULL) %>% #<- load childbirth file
  filter(CAH5 == 2) %>% # keep only women
  filter(CAH10 != 0) %>% # drop women who have no children according to this file
  filter(CAH7 != 9998) %>% # drop women with missing birth year info
  mutate(MID = CAH3 * 1000 + CAH4, KID = CAH10 * 1000 + CAH11) %>%
  group_by(MID) %>%
  filter(!any(CAH10 == 9999))


## -----------------------------------------------------------------------------------------------

mother_index <- child_record %>%
  select(MID, KID) %>%
  inner_join(KID) %>%
  select(MID) %>%
  unique()

C <- child_record %>% #<- this matches the sample from before. All good.
  inner_join(mother_index)


## -----------------------------------------------------------------------------------------------
MakePanel <- function(d) {
  year <- (d$CAH7[1] + 16):2017
  n_y <- length(year)
  knum <- d$CAH108[1]
  age_youngest <- integer(n_y) - 1
  age_oldest <- integer(n_y) + Inf
  num_0_5 <- integer(n_y)
  num_6_12 <- integer(n_y)
  num_child <- integer(n_y)
  m_age <- year - d$CAH7[1]
  for (i in 1:length(year)) {
    ay <- Inf
    ao <- -1
    a5 <- 0
    a6 <- 0
    for (k in 1:nrow(d)) {
      agek <- year[i] - d$CAH15[k]
      if (agek >= 0 && agek <= 18) {
        ay <- min(ay, agek)
        ao <- max(ao, agek)
        num_child[i] <- num_child[i] + 1
      }
      if (agek >= 0 && agek <= 5) {
        a5 <- a5 + 1
      }
      if (agek > 5 && agek <= 12) {
        a6 <- a6 + 1
      }
    }
    age_youngest[i] <- ay
    age_oldest[i] <- ao
    num_0_5[i] <- a5
    num_6_12[i] <- a6
  }
  data.frame(MID = d$MID[1], m_age = m_age, year = year, knum = knum, num_child = num_child, age_youngest = age_youngest, age_oldest = age_oldest, num_0_5 = num_0_5, num_6_12 = num_6_12, y_first = d$CAH15[1])
}


## -----------------------------------------------------------------------------------------------

mother_panel <- C %>%
  group_by(MID) %>%
  do(MakePanel(.)) %>%
  filter(y_first >= 1960) %>%
  left_join(passage_comprehension) # Add mother's cognitive score



## -----------------------------------------------------------------------------------------------
Ind <- identifiers_panel %>%
  mutate(MID = intnum68 * 1000 + pernum, mar_stat = mpair > 0)


## -----------------------------------------------------------------------------------------------
Ind <- Ind %>%
  filter(year >= 1999) %>%
  mutate(year = year - 1) %>%
  rbind(Ind)


## -----------------------------------------------------------------------------------------------
# this will be useful throughout: create a dataframe that maps mothers to their interview number and sequence number
D_Ind <- mother_panel %>%
  inner_join(Ind)


## -----------------------------------------------------------------------------------------------
main_childcare <- main_childcare %>%
  inner_join(D_Ind) %>% #<- merge to mothers through the panel file
  mutate(year = year - 1) %>% #<- convert survey year to lag year
  select(MID, year, childcare_exp)

## -----------------------------------------------------------------------------------------------
mother_panel <- main_state %>%
  right_join(Ind) %>% #<- merge with Individual file
  right_join(mother_panel) %>% #<- merge with the panel we created (keeping all years)
  arrange(MID, year) %>%
  group_by(MID) %>%
  fill(state, .direction = "downup") %>% #<- fill in missing observations
  merge(state_codes) %>% #<- merge in the cross-walk with FIPS codes and SOI codes
  left_join(main_childcare) #<= use only the main file to link


## -----------------------------------------------------------------------------------------------
mother_panel <- mother_panel %>%
  select(MID, year, m_age) %>%
  right_join(Ind) %>% #<- merge with individual file, keep all observations from Ind
  filter(is.na(m_age)) %>% #<- identify spouses by missing info from the merge
  rename(FID = MID, snF = sn) %>% #<- rename variables for merge
  filter(mpair > 0) %>% #<- only keep individuals who are not mothers but who are in a relationship in the same household
  select(FID, year, intnum, mpair, snF) %>%
  right_join(mother_panel) %>% #<- do the merge
  mutate(snF = replace_na(snF, -1)) #<- replace missing values.


## -----------------------------------------------------------------------------------------------
#Load miles to parents in 1988

d <- parent_distance %>%
  rename(intnum88=intnum)

#Match miles to parents in 1988 by family intnum for mothers and fathers
id <- Ind  %>% 
  filter(year==1988) %>%
  rename(id = MID)%>%
  select(id, sn, intnum)%>%
  rename(intnum88=intnum)

# mothers: use mid from the main sample, assign distance based on family intnum and sn
mother_distance <- mother_panel %>%
  select(MID) %>%
  unique() %>%
  merge(id,by.x="MID",by.y="id") %>%
  merge(d) %>%
  mutate(m_parents_hundred = case_when(sn==1 ~ head_hundred_miles,sn==2 ~ sp_hundred_miles),
         m_parents_ten = case_when(sn==1 ~ head_ten_miles,sn==2 ~ sp_ten_miles)) %>%
  select(MID,m_parents_hundred,m_parents_ten)


# fathers: use fid from the main sample, assign distance based on family intnum and sn
father_distance <- mother_panel %>%
  select(FID) %>%
  drop_na() %>%
  unique() %>%
  merge(id,by.x="FID",by.y="id") %>%
  merge(d) %>%
  mutate(f_parents_hundred = case_when(sn==1 ~ head_hundred_miles,sn==2 ~ sp_hundred_miles),
         f_parents_ten = case_when(sn==1 ~ head_ten_miles,sn==2 ~ sp_ten_miles)) %>%
  select(FID,f_parents_hundred,f_parents_ten)



## -----------------------------------------------------------------------------------------------
# #  ---- load marriage info and create some summary variables
M <- read.csv("../../../data/data_PSID/main/marriage/Marriage.csv") %>%
  mutate(X = NULL) %>%
  rename(MID = ID1) %>% #<- we rename to MID in preparation for a first merge with sample mothers
  rename(ybirth = MH6) %>%
  filter(ybirth < 9998) %>% #<- drop individuals with missing birth info
  rename(ymar = MH11) %>%
  filter(ymar != 9998) %>% #<- remove individuals without marriage info
  mutate(record = ymar < 9999) %>% #<- ymar==9999 indicates never married
  group_by(MID) %>%
  summarise(ever_married = sum(record) > 0, ybirth = ybirth[1])

mother_panel <- M %>%
  inner_join(mother_panel)


## -----------------------------------------------------------------------------------------------

mother_panel <- D_Ind %>%
  select(MID, year, m_age) %>%
  right_join(Ind) %>%
  filter(is.na(m_age)) %>% #<- pick out individuals from the merge who were *not* matched
  rename(FID = MID, snF = sn) %>%
  select(c("FID", "year", "intnum", "mpair", "snF")) %>%
  filter(mpair > 0) %>%
  right_join(mother_panel)

mother_panel <- M %>%
  rename(FID = MID, ybirthF = ybirth) %>%
  right_join(mother_panel) %>%
  mutate(f_age = year - ybirthF)



## -----------------------------------------------------------------------------------------------
# normalized to year 2002
cpi = read.csv("../../../data/data_PSID/CPI-U.csv") %>%
  rename(year = YEAR) %>%
  mutate(CPIU = CPIU/CPIU[56])


## -----------------------------------------------------------------------------------------------
mother_panel <- mother_panel %>% #<- drop if mother has missing race or education info
  inner_join(education, by = c("MID" = "ID")) %>%
  rename(m_ed = ed) %>%
  left_join(education, by = c("FID" = "ID")) %>%
  rename(f_ed = ed) %>%
  left_join(race, by = c("MID" = "ID")) %>%
  rename(m_race = Race) %>%
  left_join(race, by = c("FID" = "ID")) %>%
  rename(f_race = Race)%>%
  left_join(cpi)


## -----------------------------------------------------------------------------------------------
earnings_panel <- mother_panel %>%
  select(MID, FID, year, intnum, sn, snF) %>%
  inner_join(earnings_panel) %>% #<- merge based on year and interview number
  mutate(m_hrs = case_when(sn == 1 ~ hours_head, sn == 2 ~ hours_spouse), m_earn = case_when(sn == 1 ~ earn_head, sn == 2 ~ earn_spouse)) %>%
  mutate(f_hrs = case_when(snF == 1 ~ hours_head, snF == 2 ~ hours_spouse), f_earn = case_when(snF == 1 ~ earn_head, snF == 2 ~ earn_spouse)) %>%
  mutate(m_wage = m_earn / m_hrs, f_wage = f_earn / f_hrs) %>%
  select(MID, year, m_earn, m_hrs, m_wage, f_earn, f_hrs, f_wage) %>% #<- we don't keep FID because the coupling MID,FID is dynamic and may not match the pair (MID,FID) from the previous year when we merge these data back in
  mutate(year = year - 1) #<- these variable refer to the previous year so we subtract one


## -----------------------------------------------------------------------------------------------
# we link first to mothers:
T2m <- mother_panel %>%
  select(MID) %>%
  unique() %>%
  inner_join(earnings_panel_supplement) %>% #<- first four lines pick out mothers in CDS sample
  rename(m_earn = earn, m_hrs = hrs) %>%
  mutate(m_wage = m_earn / m_hrs) %>%
  select(MID, year, m_earn, m_wage, m_hrs) #<- these are mothers so drop the FID variable

# and then to fathers
T2f <- mother_panel %>%
  select(FID) %>%
  unique() %>%
  inner_join(earnings_panel_supplement) %>% #<-first four lines pick out "fathers" in sample (recall we are labelling all spouses as fathers)
  rename(f_earn = earn, f_hrs = hrs) %>%
  mutate(f_wage = f_earn / f_hrs) %>%
  select(FID, year, f_earn, f_wage, f_hrs) #<- these are fathers so drop the MID variable

# now we link the pair (MID,FID,year) first to mothers (MID,year) and then to fathers (MID,year), *then* we append all of this to the data from the main interview


## -----------------------------------------------------------------------------------------------

mother_panel <- mother_panel %>%
  left_join(total_income)


## -----------------------------------------------------------------------------------------------
mother_panel <- mother_panel %>%
  select(MID, FID, year) %>%
  inner_join(T2m) %>%
  left_join(T2f) %>%
  select(-FID) %>%
  rbind(earnings_panel) %>% #<- add all of this to the original data frame with the interview year questions
  mutate(house_earn = case_when(!is.na(f_earn) ~ f_earn + m_earn, TRUE ~ m_earn), m_wage = na_if(na_if(m_wage, Inf), 0), f_wage = na_if(na_if(f_wage, Inf), 0)) %>%
  right_join(mother_panel) # %>%

#load occupation data ad assign occupation to wife and husband by interview number and sequence number
occupations <- read.csv("../../../data/data_PSID/main/occupation/output/occ.csv")

mother_panel <- mother_panel %>%
 left_join(occupations)%>%
 mutate(m_occ_major = case_when(sn == 1 ~ occ_major_h, sn == 2 ~ occ_major_s),
        m_occ_minor = case_when(sn == 1 ~ occ_minor_h, sn == 2 ~ occ_minor_s),
        f_occ_major = case_when(snF == 1 ~ occ_major_h, snF == 2 ~ occ_major_s),
        f_occ_minor = case_when(snF == 1 ~ occ_minor_h, snF == 2 ~ occ_minor_s))



## -----------------------------------------------------------------------------------------------
mother_panel <- mother_panel %>% #set to missing age of children if no children
      mutate(
      age_oldest = replace(age_oldest, which(num_child==0), NA_real_),
      age_youngest = replace(age_youngest, which(num_child==0), NA_real_),
#set to missing earnings if negative
      m_earn = replace(m_earn, which(m_earn<0), NA_real_),
      f_earn = replace(f_earn, which(f_earn<0), NA_real_),
      m_wage = replace(m_wage, which(m_wage<0), NA_real_),
      f_wage = replace(f_wage, which(f_wage<0),NA_real_),
      house_earn = replace(house_earn, which(house_earn<0), NA_real_)
    )


## -----------------------------------------------------------------------------------------------
mother_panel <- mother_panel %>% 
    mutate(
    # generate 0/1 indicator for current marrital status
    curr_married=case_when(
      mar_stat == "TRUE" ~ 1, 
      mar_stat == "FALSE" ~ 0),
    #generate race indicators for mothers
    m_white = ifelse(m_race==1,1,0), 
    m_black=ifelse(m_race==2,1,0), 
    m_r_oth=ifelse(m_race>2,1,0),
    #generate race indicators for fathers
    f_white = ifelse(f_race==1,1,0), 
    f_black=ifelse(f_race==2,1,0), 
    f_r_oth=ifelse(f_race>2,1,0),
    #generate education indicators for fathers
    fed_hsd = ifelse(f_ed== "<12",1,0),
    fed_hs = ifelse(f_ed== "12",1,0),
    fed_scoll = ifelse(f_ed== "13-15",1,0),
    fed_coll = ifelse(f_ed== "16",1,0),
    fed_postcol = ifelse(f_ed== ">16",1,0),
    fed_collplus = fed_coll+fed_postcol,
    fed_scollplus =fed_scoll+ fed_coll+fed_postcol,
    #generate education indicators for mothers
    med_hsd = ifelse(m_ed== "<12",1,0),
    med_hs = ifelse(m_ed== "12",1,0),
    med_scoll = ifelse(m_ed== "13-15",1,0),
    med_coll = ifelse(m_ed== "16",1,0),
    med_postcol  = ifelse(m_ed== ">16",1,0),
    med_collplus = med_coll+med_postcol,
    med_scollplus = med_scoll+med_coll+med_postcol,
    #generate a variable measuring number of children younger than 12
    num_0_12=num_0_5+num_6_12,
    #mom age at oldest child's birth
    momageatbirth1 = m_age - age_oldest,
    # create measures of potential work experience
    m_exper = case_when( med_scoll == 1~m_age-20, med_coll == 1 ~ m_age-22, med_postcol == 1 ~ m_age-24, TRUE ~m_age-18),
    f_exper = case_when ( fed_scoll == 1~ f_age-20, fed_coll == 1 ~ f_age-22, fed_postcol == 1 ~ f_age-24, TRUE ~f_age-18),
    m_exper2 = m_exper*m_exper,
    f_exper2 = f_exper*f_exper)

#Generate non-labour income measure for married and single parents
mother_panel <- mother_panel %>%
  mutate(non_lab_income= case_when(
    curr_married == 1 ~ total_income-m_earn-f_earn,
    curr_married == 0 ~ total_income-m_earn)
    ) %>%
  mutate(non_lab_income=non_lab_income/52) #annual to weekly



## -----------------------------------------------------------------------------------------------
mother_panel <- mother_panel %>%
  #mom is between 16 and 45 at birth of the oldest child (or missing)%>%
  mutate(
    ind_not_sample=case_when(
      momageatbirth1>=16 & momageatbirth1<=45 ~ 0,
      is.na(momageatbirth1)==1~0, 
      TRUE~1)) %>%
    #mom and dad are between 18 and 65 or age missing
  mutate(
    ind_not_sample=case_when(
      m_age>=18&m_age<=65 ~ ind_not_sample,
      is.na(m_age)==1~ind_not_sample, 
      TRUE~1)
      ) %>%
  mutate(
    ind_not_sample=case_when(
      f_age>=18&f_age<=65 ~ ind_not_sample,
      is.na(f_age)==1 ~ ind_not_sample, TRUE~1)
      )


## -----------------------------------------------------------------------------------------------
#Calculate quantiles for observations that will remain in the sample (ind_not_sample==0) by year\broad education group

q_m <- mother_panel %>%
  filter(ind_not_sample==0)%>%
  group_by(year, med_scollplus) %>%
  summarize(
    qm1 = quantile(m_wage, 0.01, na.rm = TRUE),
    qm99 = quantile(m_wage, probs = 0.99, na.rm = TRUE))

q_f <- mother_panel %>%
  filter(ind_not_sample==0)%>%
  group_by(year, fed_scollplus) %>%
  summarize(
    qf1 = quantile(f_wage, 0.01, na.rm = TRUE),
    qf99 = quantile(f_wage, probs = 0.99, na.rm = TRUE))

mother_panel <- mother_panel %>%
  left_join(q_m) %>%
  left_join(q_f) %>%
  mutate(m_wage=case_when(m_wage >= qm1 & m_wage <= qm99 ~ m_wage, TRUE~ NA_real_)) %>%
  mutate(f_wage=  case_when(f_wage >= qf1 & f_wage <= qf99 ~ f_wage, TRUE~ NA_real_)) %>%
  select(-qf1,-qf99,-qm1,-qm99)

#Deflate wages, annual earning,  non-labour income to 2002 dollars 
mother_panel <- mother_panel %>%
  mutate(m_wage=m_wage/CPIU,
          f_wage=f_wage/CPIU,
          m_earn=m_earn/CPIU,
          f_earn=f_earn/CPIU,
          total_income=total_income/CPIU,
          tax_income=tax_income/CPIU,
          non_lab_income=non_lab_income/CPIU)


# Calculate log wage measures for trimmed wages
mother_panel <- mother_panel %>% 
  mutate(
        ln_wage_m=log(m_wage),
        ln_wage_f=log(f_wage),
        ln_wage_fm_ratio = ln_wage_f - ln_wage_m)


## -----------------------------------------------------------------------------------------------
write.csv(mother_panel, "../../../data/data_derived/MotherPanelCDS.csv")
write.dta(mother_panel, "../../../data/data_derived/MotherPanelCDS.dta")


## -----------------------------------------------------------------------------------------------
child_panel <- KID %>%
  inner_join(C) %>%
  group_by(KID) %>%
  filter(sum(CAH2 == 2) == 0) %>% # drop children with adoption records
  rename(ybirth_child = CAH15) %>%
  mutate(ybirth_child = na_if(ybirth_child, 9998)) %>%
  select(MID, KID, ybirth_child) %>%
  merge(mother_panel) %>%
  mutate(age = year - ybirth_child) %>%
  left_join(assessment_panel) %>%
  left_join(cds_expenditures) %>%
  left_join(cds_childcare) %>%
  left_join(TD_agg) %>%
  left_join(PPE) %>%
  left_join(age_month)%>%
  left_join(staff_ratio_panel)%>%#staff to child ratios by age of child in months
  left_join(relative_present)%>%
  filter(year>=1997,year<=2007)



#Add mobility data and 1988 distance to parents
child_panel <- child_panel %>%
  left_join(head_grow_up) %>%
  left_join(mother_distance) %>%
  left_join(father_distance) 


child_panel <- child_panel %>% #generate HH weekly childcare expenditure per child
  mutate(chcare_hh=childcare_exp/52,
         chcare_hh_pc=chcare_hh/num_0_12,
         #generate total goods expenditures
         hhinvest = SchSupplies + sports + lessons + comm_grps + tutoring + Toys)

child_panel <- child_panel %>%
  #generate age at birth of the child
  mutate(momageatbirth=m_age-age)



# Sample restriction: only households with children age 0-18
child_panel <- child_panel %>%
  mutate(ind_not_sample=case_when((num_child>0 & age>=0 & age<=18) ~ ind_not_sample, TRUE~1))

#Deflate childcare and goods expenditures to 2002 dollars 
 child_panel <- child_panel %>%
    mutate(chcare=chcare/CPIU,
           chcare_second=chcare_second/CPIU,
           chcare_hh_pc=chcare_hh_pc/CPIU,
           chcare_hh =chcare_hh /CPIU,
           hhinvest=hhinvest/CPIU)


## -----------------------------------------------------------------------------------------------
write.csv(child_panel,"../../../data/data_derived/ChildPanelCDS.csv")
write.dta(child_panel, "../../../data/data_derived/ChildPanelCDS.dta")


